summaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig249
-rw-r--r--arch/x86/Kconfig.assembler12
-rw-r--r--arch/x86/Kconfig.debug5
-rw-r--r--arch/x86/Makefile33
-rw-r--r--arch/x86/Makefile.um1
-rw-r--r--arch/x86/boot/Makefile18
-rw-r--r--arch/x86/boot/compressed/Makefile15
-rw-r--r--arch/x86/boot/compressed/head_64.S5
-rw-r--r--arch/x86/boot/compressed/kaslr.c43
-rw-r--r--arch/x86/boot/compressed/misc.c7
-rw-r--r--arch/x86/boot/compressed/misc.h2
-rw-r--r--arch/x86/boot/compressed/sev.c263
-rw-r--r--arch/x86/boot/cpucheck.c2
-rwxr-xr-xarch/x86/boot/install.sh2
-rw-r--r--arch/x86/boot/main.c42
-rw-r--r--arch/x86/boot/printf.c3
-rw-r--r--arch/x86/coco/Makefile1
-rw-r--r--arch/x86/coco/core.c1
-rw-r--r--arch/x86/coco/sev/Makefile15
-rw-r--r--arch/x86/coco/sev/core.c (renamed from arch/x86/kernel/sev.c)465
-rw-r--r--arch/x86/coco/sev/shared.c (renamed from arch/x86/kernel/sev-shared.c)460
-rw-r--r--arch/x86/coco/tdx/tdx.c128
-rw-r--r--arch/x86/configs/hardening.config3
-rw-r--r--arch/x86/configs/tiny.config4
-rw-r--r--arch/x86/crypto/Kconfig9
-rw-r--r--arch/x86/crypto/Makefile7
-rw-r--r--arch/x86/crypto/aes-gcm-aesni-x86_64.S1128
-rw-r--r--arch/x86/crypto/aes-gcm-avx10-x86_64.S1222
-rw-r--r--arch/x86/crypto/aes-xts-avx-x86_64.S845
-rw-r--r--arch/x86/crypto/aesni-intel_asm.S1970
-rw-r--r--arch/x86/crypto/aesni-intel_avx-x86_64.S2804
-rw-r--r--arch/x86/crypto/aesni-intel_glue.c1586
-rw-r--r--arch/x86/crypto/camellia_glue.c2
-rw-r--r--arch/x86/crypto/crc32-pclmul_glue.c1
-rw-r--r--arch/x86/crypto/curve25519-x86_64.c1
-rw-r--r--arch/x86/crypto/ghash-clmulni-intel_glue.c2
-rw-r--r--arch/x86/crypto/nh-avx2-x86_64.S1
-rw-r--r--arch/x86/crypto/poly1305_glue.c4
-rw-r--r--arch/x86/crypto/sha256-avx2-asm.S17
-rw-r--r--arch/x86/crypto/sha256_ni_asm.S253
-rw-r--r--arch/x86/crypto/sha512-avx2-asm.S1
-rw-r--r--arch/x86/crypto/twofish_glue_3way.c9
-rw-r--r--arch/x86/entry/Makefile2
-rw-r--r--arch/x86/entry/common.c2
-rw-r--r--arch/x86/entry/entry_64_compat.S15
-rw-r--r--arch/x86/entry/entry_fred.c2
-rw-r--r--arch/x86/entry/syscall_32.c10
-rw-r--r--arch/x86/entry/syscall_64.c9
-rw-r--r--arch/x86/entry/syscall_x32.c7
-rw-r--r--arch/x86/entry/syscalls/syscall_32.tbl10
-rw-r--r--arch/x86/entry/syscalls/syscall_64.tbl11
-rw-r--r--arch/x86/entry/thunk.S (renamed from arch/x86/entry/thunk_64.S)0
-rw-r--r--arch/x86/entry/thunk_32.S18
-rw-r--r--arch/x86/entry/vdso/Makefile31
-rw-r--r--arch/x86/entry/vdso/vdso.lds.S2
-rw-r--r--arch/x86/entry/vdso/vgetrandom-chacha.S178
-rw-r--r--arch/x86/entry/vdso/vgetrandom.c15
-rw-r--r--arch/x86/entry/vdso/vma.c3
-rw-r--r--arch/x86/events/amd/core.c65
-rw-r--r--arch/x86/events/amd/lbr.c13
-rw-r--r--arch/x86/events/amd/uncore.c36
-rw-r--r--arch/x86/events/core.c194
-rw-r--r--arch/x86/events/intel/bts.c3
-rw-r--r--arch/x86/events/intel/core.c682
-rw-r--r--arch/x86/events/intel/cstate.c335
-rw-r--r--arch/x86/events/intel/ds.c180
-rw-r--r--arch/x86/events/intel/knc.c2
-rw-r--r--arch/x86/events/intel/lbr.c3
-rw-r--r--arch/x86/events/intel/p4.c10
-rw-r--r--arch/x86/events/intel/p6.c2
-rw-r--r--arch/x86/events/intel/pt.c45
-rw-r--r--arch/x86/events/intel/pt.h4
-rw-r--r--arch/x86/events/intel/uncore.c207
-rw-r--r--arch/x86/events/intel/uncore.h10
-rw-r--r--arch/x86/events/intel/uncore_discovery.c306
-rw-r--r--arch/x86/events/intel/uncore_discovery.h22
-rw-r--r--arch/x86/events/intel/uncore_nhmex.c3
-rw-r--r--arch/x86/events/intel/uncore_snb.c185
-rw-r--r--arch/x86/events/intel/uncore_snbep.c139
-rw-r--r--arch/x86/events/msr.c132
-rw-r--r--arch/x86/events/perf_event.h111
-rw-r--r--arch/x86/events/rapl.c149
-rw-r--r--arch/x86/events/zhaoxin/core.c12
-rw-r--r--arch/x86/hyperv/hv_init.c5
-rw-r--r--arch/x86/hyperv/hv_vtl.c1
-rw-r--r--arch/x86/hyperv/ivm.c28
-rw-r--r--arch/x86/include/asm/Kbuild1
-rw-r--r--arch/x86/include/asm/acpi.h17
-rw-r--r--arch/x86/include/asm/alternative.h255
-rw-r--r--arch/x86/include/asm/amd_nb.h4
-rw-r--r--arch/x86/include/asm/apic.h56
-rw-r--r--arch/x86/include/asm/asm.h3
-rw-r--r--arch/x86/include/asm/atomic.h12
-rw-r--r--arch/x86/include/asm/atomic64_32.h87
-rw-r--r--arch/x86/include/asm/atomic64_64.h12
-rw-r--r--arch/x86/include/asm/barrier.h24
-rw-r--r--arch/x86/include/asm/bitops.h10
-rw-r--r--arch/x86/include/asm/boot.h5
-rw-r--r--arch/x86/include/asm/bug.h12
-rw-r--r--arch/x86/include/asm/cfi.h2
-rw-r--r--arch/x86/include/asm/cmdline.h4
-rw-r--r--arch/x86/include/asm/cmpxchg_32.h203
-rw-r--r--arch/x86/include/asm/cmpxchg_64.h8
-rw-r--r--arch/x86/include/asm/cpu_device_id.h116
-rw-r--r--arch/x86/include/asm/cpufeature.h16
-rw-r--r--arch/x86/include/asm/cpufeatures.h803
-rw-r--r--arch/x86/include/asm/cpuid.h8
-rw-r--r--arch/x86/include/asm/efi.h23
-rw-r--r--arch/x86/include/asm/entry-common.h28
-rw-r--r--arch/x86/include/asm/extable.h1
-rw-r--r--arch/x86/include/asm/extable_fixup_types.h2
-rw-r--r--arch/x86/include/asm/fb.h19
-rw-r--r--arch/x86/include/asm/fpu.h13
-rw-r--r--arch/x86/include/asm/fpu/api.h3
-rw-r--r--arch/x86/include/asm/fpu/signal.h2
-rw-r--r--arch/x86/include/asm/fpu/types.h13
-rw-r--r--arch/x86/include/asm/fred.h23
-rw-r--r--arch/x86/include/asm/ftrace.h4
-rw-r--r--arch/x86/include/asm/hardirq.h14
-rw-r--r--arch/x86/include/asm/ia32.h11
-rw-r--r--arch/x86/include/asm/ia32_unistd.h12
-rw-r--r--arch/x86/include/asm/idtentry.h12
-rw-r--r--arch/x86/include/asm/inat.h17
-rw-r--r--arch/x86/include/asm/init.h3
-rw-r--r--arch/x86/include/asm/insn.h32
-rw-r--r--arch/x86/include/asm/intel-family.h174
-rw-r--r--arch/x86/include/asm/intel_ds.h1
-rw-r--r--arch/x86/include/asm/intel_pconfig.h65
-rw-r--r--arch/x86/include/asm/intel_scu_ipc.h68
-rw-r--r--arch/x86/include/asm/intel_telemetry.h2
-rw-r--r--arch/x86/include/asm/io.h18
-rw-r--r--arch/x86/include/asm/irq_remapping.h7
-rw-r--r--arch/x86/include/asm/irq_stack.h2
-rw-r--r--arch/x86/include/asm/irq_vectors.h12
-rw-r--r--arch/x86/include/asm/irqflags.h20
-rw-r--r--arch/x86/include/asm/kexec.h13
-rw-r--r--arch/x86/include/asm/kvm-x86-ops.h15
-rw-r--r--arch/x86/include/asm/kvm-x86-pmu-ops.h3
-rw-r--r--arch/x86/include/asm/kvm_host.h189
-rw-r--r--arch/x86/include/asm/mce.h7
-rw-r--r--arch/x86/include/asm/mmu_context.h13
-rw-r--r--arch/x86/include/asm/mmzone.h6
-rw-r--r--arch/x86/include/asm/mmzone_32.h17
-rw-r--r--arch/x86/include/asm/mmzone_64.h18
-rw-r--r--arch/x86/include/asm/mpspec.h6
-rw-r--r--arch/x86/include/asm/mshyperv.h1
-rw-r--r--arch/x86/include/asm/msr-index.h56
-rw-r--r--arch/x86/include/asm/msr.h25
-rw-r--r--arch/x86/include/asm/mtrr.h2
-rw-r--r--arch/x86/include/asm/numa.h26
-rw-r--r--arch/x86/include/asm/page_64.h3
-rw-r--r--arch/x86/include/asm/page_types.h8
-rw-r--r--arch/x86/include/asm/percpu.h616
-rw-r--r--arch/x86/include/asm/perf_event.h8
-rw-r--r--arch/x86/include/asm/pgtable.h175
-rw-r--r--arch/x86/include/asm/pgtable_64.h23
-rw-r--r--arch/x86/include/asm/pgtable_64_types.h4
-rw-r--r--arch/x86/include/asm/pgtable_types.h5
-rw-r--r--arch/x86/include/asm/posted_intr.h118
-rw-r--r--arch/x86/include/asm/processor.h48
-rw-r--r--arch/x86/include/asm/prom.h9
-rw-r--r--arch/x86/include/asm/pvclock.h1
-rw-r--r--arch/x86/include/asm/qspinlock.h25
-rw-r--r--arch/x86/include/asm/qspinlock_paravirt.h7
-rw-r--r--arch/x86/include/asm/reboot.h6
-rw-r--r--arch/x86/include/asm/resctrl.h6
-rw-r--r--arch/x86/include/asm/runtime-const.h61
-rw-r--r--arch/x86/include/asm/seccomp.h2
-rw-r--r--arch/x86/include/asm/set_memory.h3
-rw-r--r--arch/x86/include/asm/setup.h8
-rw-r--r--arch/x86/include/asm/sev-common.h51
-rw-r--r--arch/x86/include/asm/sev.h188
-rw-r--r--arch/x86/include/asm/shstk.h4
-rw-r--r--arch/x86/include/asm/smp.h1
-rw-r--r--arch/x86/include/asm/sparsemem.h11
-rw-r--r--arch/x86/include/asm/special_insns.h8
-rw-r--r--arch/x86/include/asm/string_64.h45
-rw-r--r--arch/x86/include/asm/svm.h29
-rw-r--r--arch/x86/include/asm/switch_to.h6
-rw-r--r--arch/x86/include/asm/syscall.h7
-rw-r--r--arch/x86/include/asm/text-patching.h2
-rw-r--r--arch/x86/include/asm/tlbflush.h9
-rw-r--r--arch/x86/include/asm/topology.h13
-rw-r--r--arch/x86/include/asm/tsc.h3
-rw-r--r--arch/x86/include/asm/uaccess.h6
-rw-r--r--arch/x86/include/asm/uaccess_64.h11
-rw-r--r--arch/x86/include/asm/unistd.h1
-rw-r--r--arch/x86/include/asm/uv/uv_irq.h1
-rw-r--r--arch/x86/include/asm/vdso/getrandom.h42
-rw-r--r--arch/x86/include/asm/vdso/gettimeofday.h43
-rw-r--r--arch/x86/include/asm/vdso/vsyscall.h9
-rw-r--r--arch/x86/include/asm/vgtod.h5
-rw-r--r--arch/x86/include/asm/video.h21
-rw-r--r--arch/x86/include/asm/vm86.h2
-rw-r--r--arch/x86/include/asm/vmware.h336
-rw-r--r--arch/x86/include/asm/vmx.h53
-rw-r--r--arch/x86/include/asm/vmxfeatures.h110
-rw-r--r--arch/x86/include/asm/vvar.h16
-rw-r--r--arch/x86/include/asm/word-at-a-time.h57
-rw-r--r--arch/x86/include/asm/x86_init.h14
-rw-r--r--arch/x86/include/asm/xen/hypervisor.h5
-rw-r--r--arch/x86/include/uapi/asm/elf.h16
-rw-r--r--arch/x86/include/uapi/asm/kvm.h72
-rw-r--r--arch/x86/include/uapi/asm/svm.h1
-rw-r--r--arch/x86/kernel/Makefile18
-rw-r--r--arch/x86/kernel/acpi/Makefile1
-rw-r--r--arch/x86/kernel/acpi/boot.c97
-rw-r--r--arch/x86/kernel/acpi/cppc.c172
-rw-r--r--arch/x86/kernel/acpi/madt_playdead.S28
-rw-r--r--arch/x86/kernel/acpi/madt_wakeup.c292
-rw-r--r--arch/x86/kernel/alternative.c161
-rw-r--r--arch/x86/kernel/amd_gart_64.c2
-rw-r--r--arch/x86/kernel/amd_nb.c61
-rw-r--r--arch/x86/kernel/apic/apic.c136
-rw-r--r--arch/x86/kernel/apic/apic_flat_64.c119
-rw-r--r--arch/x86/kernel/apic/io_apic.c749
-rw-r--r--arch/x86/kernel/apic/msi.c5
-rw-r--r--arch/x86/kernel/apic/vector.c14
-rw-r--r--arch/x86/kernel/apic/x2apic_cluster.c7
-rw-r--r--arch/x86/kernel/callthunks.c9
-rw-r--r--arch/x86/kernel/cpu/Makefile4
-rw-r--r--arch/x86/kernel/cpu/amd.c41
-rw-r--r--arch/x86/kernel/cpu/aperfmperf.c113
-rw-r--r--arch/x86/kernel/cpu/bugs.c106
-rw-r--r--arch/x86/kernel/cpu/common.c203
-rw-r--r--arch/x86/kernel/cpu/cpu.h2
-rw-r--r--arch/x86/kernel/cpu/cpuid-deps.c4
-rw-r--r--arch/x86/kernel/cpu/feat_ctl.c2
-rw-r--r--arch/x86/kernel/cpu/intel.c230
-rw-r--r--arch/x86/kernel/cpu/intel_epb.c12
-rw-r--r--arch/x86/kernel/cpu/intel_pconfig.c84
-rw-r--r--arch/x86/kernel/cpu/match.c9
-rw-r--r--arch/x86/kernel/cpu/mce/amd.c2
-rw-r--r--arch/x86/kernel/cpu/mce/apei.c18
-rw-r--r--arch/x86/kernel/cpu/mce/core.c69
-rw-r--r--arch/x86/kernel/cpu/mce/dev-mcelog.c3
-rw-r--r--arch/x86/kernel/cpu/mce/genpool.c40
-rw-r--r--arch/x86/kernel/cpu/mce/inject.c9
-rw-r--r--arch/x86/kernel/cpu/mce/intel.c21
-rw-r--r--arch/x86/kernel/cpu/mce/internal.h2
-rw-r--r--arch/x86/kernel/cpu/mce/severity.c27
-rw-r--r--arch/x86/kernel/cpu/microcode/amd.c194
-rw-r--r--arch/x86/kernel/cpu/microcode/core.c5
-rw-r--r--arch/x86/kernel/cpu/microcode/intel.c5
-rw-r--r--arch/x86/kernel/cpu/mkcapflags.sh3
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c32
-rw-r--r--arch/x86/kernel/cpu/mtrr/mtrr.c8
-rw-r--r--arch/x86/kernel/cpu/resctrl/core.c379
-rw-r--r--arch/x86/kernel/cpu/resctrl/ctrlmondata.c127
-rw-r--r--arch/x86/kernel/cpu/resctrl/internal.h111
-rw-r--r--arch/x86/kernel/cpu/resctrl/monitor.c262
-rw-r--r--arch/x86/kernel/cpu/resctrl/pseudo_lock.c67
-rw-r--r--arch/x86/kernel/cpu/resctrl/rdtgroup.c298
-rw-r--r--arch/x86/kernel/cpu/resctrl/trace.h (renamed from arch/x86/kernel/cpu/resctrl/pseudo_lock_event.h)24
-rw-r--r--arch/x86/kernel/cpu/scattered.c1
-rw-r--r--arch/x86/kernel/cpu/sgx/driver.c2
-rw-r--r--arch/x86/kernel/cpu/sgx/main.c41
-rw-r--r--arch/x86/kernel/cpu/topology.c53
-rw-r--r--arch/x86/kernel/cpu/topology_amd.c37
-rw-r--r--arch/x86/kernel/cpu/topology_ext.c15
-rw-r--r--arch/x86/kernel/cpu/vmware.c225
-rw-r--r--arch/x86/kernel/crash.c44
-rw-r--r--arch/x86/kernel/devicetree.c26
-rw-r--r--arch/x86/kernel/dumpstack.c4
-rw-r--r--arch/x86/kernel/e820.c9
-rw-r--r--arch/x86/kernel/early-quirks.c85
-rw-r--r--arch/x86/kernel/eisa.c8
-rw-r--r--arch/x86/kernel/fpu/core.c4
-rw-r--r--arch/x86/kernel/fpu/signal.c27
-rw-r--r--arch/x86/kernel/fpu/xstate.c110
-rw-r--r--arch/x86/kernel/fpu/xstate.h20
-rw-r--r--arch/x86/kernel/fred.c45
-rw-r--r--arch/x86/kernel/ftrace.c16
-rw-r--r--arch/x86/kernel/head64.c3
-rw-r--r--arch/x86/kernel/head_32.S13
-rw-r--r--arch/x86/kernel/head_64.S24
-rw-r--r--arch/x86/kernel/i8253.c11
-rw-r--r--arch/x86/kernel/idt.c3
-rw-r--r--arch/x86/kernel/irq.c172
-rw-r--r--arch/x86/kernel/irq_64.c1
-rw-r--r--arch/x86/kernel/itmt.c2
-rw-r--r--arch/x86/kernel/jailhouse.c1
-rw-r--r--arch/x86/kernel/kprobes/core.c4
-rw-r--r--arch/x86/kernel/kprobes/ftrace.c3
-rw-r--r--arch/x86/kernel/kvm.c2
-rw-r--r--arch/x86/kernel/machine_kexec_64.c38
-rw-r--r--arch/x86/kernel/mmconf-fam10h_64.c1
-rw-r--r--arch/x86/kernel/module.c51
-rw-r--r--arch/x86/kernel/mpparse.c13
-rw-r--r--arch/x86/kernel/paravirt.c7
-rw-r--r--arch/x86/kernel/process.c7
-rw-r--r--arch/x86/kernel/process_64.c42
-rw-r--r--arch/x86/kernel/reboot.c22
-rw-r--r--arch/x86/kernel/relocate_kernel_64.S37
-rw-r--r--arch/x86/kernel/rtc.c1
-rw-r--r--arch/x86/kernel/setup.c54
-rw-r--r--arch/x86/kernel/shstk.c20
-rw-r--r--arch/x86/kernel/signal.c29
-rw-r--r--arch/x86/kernel/signal_32.c2
-rw-r--r--arch/x86/kernel/signal_64.c12
-rw-r--r--arch/x86/kernel/smpboot.c31
-rw-r--r--arch/x86/kernel/sys_x86_64.c29
-rw-r--r--arch/x86/kernel/time.c20
-rw-r--r--arch/x86/kernel/topology.c43
-rw-r--r--arch/x86/kernel/traps.c87
-rw-r--r--arch/x86/kernel/tsc.c110
-rw-r--r--arch/x86/kernel/tsc_msr.c14
-rw-r--r--arch/x86/kernel/tsc_sync.c6
-rw-r--r--arch/x86/kernel/uprobes.c124
-rw-r--r--arch/x86/kernel/vmlinux.lds.S12
-rw-r--r--arch/x86/kernel/x86_init.c9
-rw-r--r--arch/x86/kvm/Kconfig38
-rw-r--r--arch/x86/kvm/Makefile11
-rw-r--r--arch/x86/kvm/cpuid.c87
-rw-r--r--arch/x86/kvm/cpuid.h18
-rw-r--r--arch/x86/kvm/emulate.c73
-rw-r--r--arch/x86/kvm/hyperv.c9
-rw-r--r--arch/x86/kvm/hyperv.h1
-rw-r--r--arch/x86/kvm/irq.c12
-rw-r--r--arch/x86/kvm/irq.h1
-rw-r--r--arch/x86/kvm/irq_comm.c7
-rw-r--r--arch/x86/kvm/kvm_cache_regs.h10
-rw-r--r--arch/x86/kvm/kvm_emulate.h2
-rw-r--r--arch/x86/kvm/lapic.c197
-rw-r--r--arch/x86/kvm/lapic.h10
-rw-r--r--arch/x86/kvm/mmu.h49
-rw-r--r--arch/x86/kvm/mmu/mmu.c1118
-rw-r--r--arch/x86/kvm/mmu/mmu_internal.h57
-rw-r--r--arch/x86/kvm/mmu/mmutrace.h3
-rw-r--r--arch/x86/kvm/mmu/page_track.c2
-rw-r--r--arch/x86/kvm/mmu/paging_tmpl.h92
-rw-r--r--arch/x86/kvm/mmu/spte.c92
-rw-r--r--arch/x86/kvm/mmu/spte.h45
-rw-r--r--arch/x86/kvm/mmu/tdp_iter.h2
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.c271
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.h3
-rw-r--r--arch/x86/kvm/mtrr.c644
-rw-r--r--arch/x86/kvm/pmu.c73
-rw-r--r--arch/x86/kvm/pmu.h10
-rw-r--r--arch/x86/kvm/reverse_cpuid.h8
-rw-r--r--arch/x86/kvm/smm.c68
-rw-r--r--arch/x86/kvm/svm/nested.c6
-rw-r--r--arch/x86/kvm/svm/pmu.c11
-rw-r--r--arch/x86/kvm/svm/sev.c1940
-rw-r--r--arch/x86/kvm/svm/svm.c280
-rw-r--r--arch/x86/kvm/svm/svm.h142
-rw-r--r--arch/x86/kvm/svm/vmenter.S8
-rw-r--r--arch/x86/kvm/trace.h61
-rw-r--r--arch/x86/kvm/vmx/capabilities.h10
-rw-r--r--arch/x86/kvm/vmx/main.c168
-rw-r--r--arch/x86/kvm/vmx/nested.c224
-rw-r--r--arch/x86/kvm/vmx/nested.h8
-rw-r--r--arch/x86/kvm/vmx/pmu_intel.c52
-rw-r--r--arch/x86/kvm/vmx/posted_intr.c4
-rw-r--r--arch/x86/kvm/vmx/posted_intr.h101
-rw-r--r--arch/x86/kvm/vmx/sgx.c2
-rw-r--r--arch/x86/kvm/vmx/vmcs.h5
-rw-r--r--arch/x86/kvm/vmx/vmcs12.h14
-rw-r--r--arch/x86/kvm/vmx/vmx.c708
-rw-r--r--arch/x86/kvm/vmx/vmx.h20
-rw-r--r--arch/x86/kvm/vmx/vmx_onhyperv.h8
-rw-r--r--arch/x86/kvm/vmx/vmx_ops.h2
-rw-r--r--arch/x86/kvm/vmx/x86_ops.h121
-rw-r--r--arch/x86/kvm/x86.c1825
-rw-r--r--arch/x86/kvm/x86.h58
-rw-r--r--arch/x86/kvm/xen.c6
-rw-r--r--arch/x86/lib/Makefile1
-rw-r--r--arch/x86/lib/atomic64_cx8_32.S9
-rw-r--r--arch/x86/lib/cmdline.c25
-rw-r--r--arch/x86/lib/copy_mc.c21
-rw-r--r--arch/x86/lib/getuser.S67
-rw-r--r--arch/x86/lib/insn.c31
-rw-r--r--arch/x86/lib/iomap_copy_64.S15
-rw-r--r--arch/x86/lib/iomem.c5
-rw-r--r--arch/x86/lib/x86-opcode-map.txt315
-rw-r--r--arch/x86/math-emu/fpu_etc.c9
-rw-r--r--arch/x86/math-emu/fpu_trig.c6
-rw-r--r--arch/x86/math-emu/reg_constant.c7
-rw-r--r--arch/x86/mm/Makefile3
-rw-r--r--arch/x86/mm/amdtopology.c1
-rw-r--r--arch/x86/mm/cpu_entry_area.c2
-rw-r--r--arch/x86/mm/extable.c9
-rw-r--r--arch/x86/mm/fault.c31
-rw-r--r--arch/x86/mm/hugetlbpage.c35
-rw-r--r--arch/x86/mm/ident_map.c96
-rw-r--r--arch/x86/mm/init.c92
-rw-r--r--arch/x86/mm/init_64.c34
-rw-r--r--arch/x86/mm/ioremap.c3
-rw-r--r--arch/x86/mm/kaslr.c32
-rw-r--r--arch/x86/mm/mem_encrypt_amd.c16
-rw-r--r--arch/x86/mm/mmap.c4
-rw-r--r--arch/x86/mm/numa.c622
-rw-r--r--arch/x86/mm/numa_32.c1
-rw-r--r--arch/x86/mm/numa_emulation.c585
-rw-r--r--arch/x86/mm/numa_internal.h24
-rw-r--r--arch/x86/mm/pat/memtype.c67
-rw-r--r--arch/x86/mm/pat/set_memory.c139
-rw-r--r--arch/x86/mm/pgtable.c26
-rw-r--r--arch/x86/mm/pti.c51
-rw-r--r--arch/x86/mm/srat.c6
-rw-r--r--arch/x86/mm/testmmiotrace.c1
-rw-r--r--arch/x86/mm/tlb.c19
-rw-r--r--arch/x86/net/bpf_jit_comp.c278
-rw-r--r--arch/x86/net/bpf_jit_comp32.c3
-rw-r--r--arch/x86/pci/ce4100.c6
-rw-r--r--arch/x86/pci/fixup.c4
-rw-r--r--arch/x86/pci/intel_mid_pci.c8
-rw-r--r--arch/x86/pci/mmconfig-shared.c40
-rw-r--r--arch/x86/pci/olpc.c3
-rw-r--r--arch/x86/pci/xen.c4
-rw-r--r--arch/x86/platform/atom/punit_atom_debug.c11
-rw-r--r--arch/x86/platform/ce4100/ce4100.c1
-rw-r--r--arch/x86/platform/efi/Makefile1
-rw-r--r--arch/x86/platform/efi/efi.c2
-rw-r--r--arch/x86/platform/efi/fake_mem.c197
-rw-r--r--arch/x86/platform/efi/memmap.c13
-rw-r--r--arch/x86/platform/geode/Makefile1
-rw-r--r--arch/x86/platform/geode/alix.c82
-rw-r--r--arch/x86/platform/geode/geode-common.c178
-rw-r--r--arch/x86/platform/geode/geode-common.h21
-rw-r--r--arch/x86/platform/geode/geos.c80
-rw-r--r--arch/x86/platform/geode/net5501.c69
-rw-r--r--arch/x86/platform/intel-mid/intel-mid.c9
-rw-r--r--arch/x86/platform/intel/iosf_mbi.c4
-rw-r--r--arch/x86/platform/iris/iris.c5
-rw-r--r--arch/x86/platform/olpc/olpc-xo1-pm.c7
-rw-r--r--arch/x86/platform/olpc/olpc-xo1-sci.c5
-rw-r--r--arch/x86/platform/pvh/Makefile1
-rw-r--r--arch/x86/platform/pvh/enlighten.c9
-rw-r--r--arch/x86/platform/pvh/head.S161
-rw-r--r--arch/x86/purgatory/Makefile12
-rw-r--r--arch/x86/realmode/rm/Makefile11
-rw-r--r--arch/x86/tools/gen-insn-attr-x86.awk15
-rw-r--r--arch/x86/tools/relocs.c371
-rw-r--r--arch/x86/um/Makefile5
-rw-r--r--arch/x86/um/asm/mm_context.h70
-rw-r--r--arch/x86/um/asm/ptrace.h6
-rw-r--r--arch/x86/um/bugs_32.c1
-rw-r--r--arch/x86/um/bugs_64.c1
-rw-r--r--arch/x86/um/elfcore.c1
-rw-r--r--arch/x86/um/fault.c1
-rw-r--r--arch/x86/um/ldt.c380
-rw-r--r--arch/x86/um/os-Linux/mcontext.c1
-rw-r--r--arch/x86/um/os-Linux/registers.c2
-rw-r--r--arch/x86/um/os-Linux/tls.c1
-rw-r--r--arch/x86/um/ptrace_32.c2
-rw-r--r--arch/x86/um/shared/sysdep/archsetjmp.h7
-rw-r--r--arch/x86/um/shared/sysdep/kernel-offsets.h3
-rw-r--r--arch/x86/um/shared/sysdep/stub.h2
-rw-r--r--arch/x86/um/shared/sysdep/stub_32.h45
-rw-r--r--arch/x86/um/shared/sysdep/stub_64.h41
-rw-r--r--arch/x86/um/signal.c14
-rw-r--r--arch/x86/um/stub_32.S56
-rw-r--r--arch/x86/um/stub_64.S50
-rw-r--r--arch/x86/um/sys_call_table_32.c10
-rw-r--r--arch/x86/um/sys_call_table_64.c11
-rw-r--r--arch/x86/um/sysrq_32.c1
-rw-r--r--arch/x86/um/sysrq_64.c1
-rw-r--r--arch/x86/um/tls_32.c20
-rw-r--r--arch/x86/um/user-offsets.c3
-rw-r--r--arch/x86/um/vdso/Makefile9
-rw-r--r--arch/x86/um/vdso/um_vdso.c10
-rw-r--r--arch/x86/um/vdso/vma.c12
-rw-r--r--arch/x86/video/Makefile3
-rw-r--r--arch/x86/video/video-common.c (renamed from arch/x86/video/fbdev.c)21
-rw-r--r--arch/x86/virt/svm/sev.c44
-rw-r--r--arch/x86/virt/vmx/tdx/tdx.c9
-rw-r--r--arch/x86/xen/apic.c2
-rw-r--r--arch/x86/xen/debugfs.c2
-rw-r--r--arch/x86/xen/debugfs.h7
-rw-r--r--arch/x86/xen/enlighten.c38
-rw-r--r--arch/x86/xen/enlighten_hvm.c2
-rw-r--r--arch/x86/xen/enlighten_pv.c4
-rw-r--r--arch/x86/xen/enlighten_pvh.c118
-rw-r--r--arch/x86/xen/mmu.c3
-rw-r--r--arch/x86/xen/mmu.h28
-rw-r--r--arch/x86/xen/mmu_hvm.c2
-rw-r--r--arch/x86/xen/mmu_pv.c27
-rw-r--r--arch/x86/xen/multicalls.c133
-rw-r--r--arch/x86/xen/multicalls.h69
-rw-r--r--arch/x86/xen/p2m.c115
-rw-r--r--arch/x86/xen/pmu.c1
-rw-r--r--arch/x86/xen/pmu.h22
-rw-r--r--arch/x86/xen/setup.c204
-rw-r--r--arch/x86/xen/smp.c1
-rw-r--r--arch/x86/xen/smp.h51
-rw-r--r--arch/x86/xen/smp_hvm.c2
-rw-r--r--arch/x86/xen/smp_pv.c4
-rw-r--r--arch/x86/xen/spinlock.c20
-rw-r--r--arch/x86/xen/suspend.c2
-rw-r--r--arch/x86/xen/time.c2
-rw-r--r--arch/x86/xen/xen-head.S2
-rw-r--r--arch/x86/xen/xen-ops.h157
493 files changed, 22583 insertions, 18146 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 928820e61cb5..2852fcd82cbd 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -28,12 +28,14 @@ config X86_64
select ARCH_HAS_GIGANTIC_PAGE
select ARCH_SUPPORTS_INT128 if CC_HAS_INT128
select ARCH_SUPPORTS_PER_VMA_LOCK
+ select ARCH_SUPPORTS_HUGE_PFNMAP if TRANSPARENT_HUGEPAGE
select HAVE_ARCH_SOFT_DIRTY
select MODULES_USE_ELF_RELA
select NEED_DMA_MAP_STATE
select SWIOTLB
select ARCH_HAS_ELFCORE_COMPAT
select ZONE_DMA32
+ select EXECMEM if DYNAMIC_FTRACE
config FORCE_DYNAMIC_FTRACE
def_bool y
@@ -78,12 +80,14 @@ config X86
select ARCH_HAS_DEBUG_VIRTUAL
select ARCH_HAS_DEBUG_VM_PGTABLE if !X86_PAE
select ARCH_HAS_DEVMEM_IS_ALLOWED
+ select ARCH_HAS_DMA_OPS if GART_IOMMU || XEN
select ARCH_HAS_EARLY_DEBUG if KGDB
select ARCH_HAS_ELF_RANDOMIZE
select ARCH_HAS_FAST_MULTIPLIER
select ARCH_HAS_FORTIFY_SOURCE
select ARCH_HAS_GCOV_PROFILE_ALL
select ARCH_HAS_KCOV if X86_64
+ select ARCH_HAS_KERNEL_FPU_SUPPORT
select ARCH_HAS_MEM_ENCRYPT
select ARCH_HAS_MEMBARRIER_SYNC_CORE
select ARCH_HAS_NMI_SAFE_THIS_CPU_OPS
@@ -105,6 +109,7 @@ config X86
select ARCH_HAS_DEBUG_WX
select ARCH_HAS_ZONE_DMA_SET if EXPERT
select ARCH_HAVE_NMI_SAFE_CMPXCHG
+ select ARCH_HAVE_EXTRA_ELF_NOTES
select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI
select ARCH_MIGHT_HAVE_PC_PARPORT
@@ -120,6 +125,7 @@ config X86
select ARCH_USES_CFI_TRAPS if X86_64 && CFI_CLANG
select ARCH_SUPPORTS_LTO_CLANG
select ARCH_SUPPORTS_LTO_CLANG_THIN
+ select ARCH_SUPPORTS_RT
select ARCH_USE_BUILTIN_BSWAP
select ARCH_USE_CMPXCHG_LOCKREF if X86_CMPXCHG64
select ARCH_USE_MEMTEST
@@ -169,6 +175,7 @@ config X86
select GENERIC_TIME_VSYSCALL
select GENERIC_GETTIMEOFDAY
select GENERIC_VDSO_TIME_NS
+ select GENERIC_VDSO_OVERFLOW_PROTECT
select GUP_GET_PXX_LOW_HIGH if X86_PAE
select HARDIRQS_SW_RESEND
select HARDLOCKUP_CHECK_TIMESTAMP if X86_64
@@ -222,7 +229,7 @@ config X86
select HAVE_EFFICIENT_UNALIGNED_ACCESS
select HAVE_EISA
select HAVE_EXIT_THREAD
- select HAVE_FAST_GUP
+ select HAVE_GUP_FAST
select HAVE_FENTRY if X86_64 || DYNAMIC_FTRACE
select HAVE_FTRACE_MCOUNT_RECORD
select HAVE_FUNCTION_GRAPH_RETVAL if HAVE_FUNCTION_GRAPH_TRACER
@@ -284,6 +291,7 @@ config X86
select HAVE_UNSTABLE_SCHED_CLOCK
select HAVE_USER_RETURN_NOTIFIER
select HAVE_GENERIC_VDSO
+ select VDSO_GETRANDOM if X86_64
select HOTPLUG_PARALLEL if SMP && X86_64
select HOTPLUG_SMT if SMP
select HOTPLUG_SPLIT_STARTUP if SMP && X86_32
@@ -292,6 +300,7 @@ config X86
select NEED_PER_CPU_EMBED_FIRST_CHUNK
select NEED_PER_CPU_PAGE_FIRST_CHUNK
select NEED_SG_DMA_LENGTH
+ select NUMA_MEMBLKS if NUMA
select PCI_DOMAINS if PCI
select PCI_LOCKLESS_CONFIG if PCI
select PERF_EVENTS
@@ -465,6 +474,17 @@ config X86_X2APIC
If you don't know what to do here, say N.
+config X86_POSTED_MSI
+ bool "Enable MSI and MSI-x delivery by posted interrupts"
+ depends on X86_64 && IRQ_REMAP
+ help
+ This enables MSIs that are under interrupt remapping to be delivered as
+ posted interrupts to the host kernel. Interrupt throughput can
+ potentially be improved by coalescing CPU notifications during high
+ frequency bursts.
+
+ If you don't know what to do here, say N.
+
config X86_MPPARSE
bool "Enable MPS table" if ACPI
default y
@@ -501,12 +521,11 @@ config X86_FRED
When enabled, try to use Flexible Return and Event Delivery
instead of the legacy SYSCALL/SYSENTER/IDT architecture for
ring transitions and exception/interrupt handling if the
- system supports.
+ system supports it.
-if X86_32
config X86_BIGSMP
bool "Support for big SMP systems with more than 8 CPUs"
- depends on SMP
+ depends on SMP && X86_32
help
This option is needed for the systems that have more than 8 CPUs.
@@ -519,7 +538,10 @@ config X86_EXTENDED_PLATFORM
systems out there.)
If you enable this option then you'll be able to select support
- for the following (non-PC) 32 bit x86 platforms:
+ for the following non-PC x86 platforms, depending on the value of
+ CONFIG_64BIT.
+
+ 32-bit platforms (CONFIG_64BIT=n):
Goldfish (Android emulator)
AMD Elan
RDC R-321x SoC
@@ -527,28 +549,14 @@ config X86_EXTENDED_PLATFORM
STA2X11-based (e.g. Northville)
Moorestown MID devices
- If you have one of these systems, or if you want to build a
- generic distribution kernel, say Y here - otherwise say N.
-endif # X86_32
-
-if X86_64
-config X86_EXTENDED_PLATFORM
- bool "Support for extended (non-PC) x86 platforms"
- default y
- help
- If you disable this option then the kernel will only support
- standard PC platforms. (which covers the vast majority of
- systems out there.)
-
- If you enable this option then you'll be able to select support
- for the following (non-PC) 64 bit x86 platforms:
+ 64-bit platforms (CONFIG_64BIT=y):
Numascale NumaChip
ScaleMP vSMP
SGI Ultraviolet
If you have one of these systems, or if you want to build a
generic distribution kernel, say Y here - otherwise say N.
-endif # X86_64
+
# This is an alphabetically sorted list of 64 bit extended platforms
# Please maintain the alphabetic order if and when there are additions
config X86_NUMACHIP
@@ -940,7 +948,6 @@ config DMI
config GART_IOMMU
bool "Old AMD GART IOMMU support"
- select DMA_OPS
select IOMMU_HELPER
select SWIOTLB
depends on X86_64 && PCI && AMD_NB
@@ -1116,6 +1123,13 @@ config X86_LOCAL_APIC
depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC || PCI_MSI
select IRQ_DOMAIN_HIERARCHY
+config ACPI_MADT_WAKEUP
+ def_bool y
+ depends on X86_64
+ depends on ACPI
+ depends on SMP
+ depends on X86_LOCAL_APIC
+
config X86_IO_APIC
def_bool y
depends on X86_LOCAL_APIC || X86_UP_IOAPIC
@@ -1589,14 +1603,6 @@ config X86_64_ACPI_NUMA
help
Enable ACPI SRAT based node topology detection.
-config NUMA_EMU
- bool "NUMA emulation"
- depends on NUMA
- help
- Enable NUMA emulation. A flat machine will be split
- into virtual nodes when booted with "numa=fake=N", where N is the
- number of nodes. This is only useful for debugging.
-
config NODES_SHIFT
int "Maximum NUMA Nodes (as a power of 2)" if !MAXSMP
range 1 10
@@ -1796,6 +1802,7 @@ config X86_PAT
def_bool y
prompt "x86 PAT support" if EXPERT
depends on MTRR
+ select ARCH_USES_PG_ARCH_2
help
Use PAT attributes to setup page level cache control.
@@ -1807,10 +1814,6 @@ config X86_PAT
If unsure, say Y.
-config ARCH_USES_PG_UNCACHED
- def_bool y
- depends on X86_PAT
-
config X86_UMIP
def_bool y
prompt "User Mode Instruction Prevention" if EXPERT
@@ -1879,6 +1882,10 @@ config X86_INTEL_MEMORY_PROTECTION_KEYS
If unsure, say y.
+config ARCH_PKEY_BITS
+ int
+ default 4
+
choice
prompt "TSX enable mode"
depends on CPU_SUP_INTEL
@@ -2036,26 +2043,6 @@ config EFI_MIXED
If unsure, say N.
-config EFI_FAKE_MEMMAP
- bool "Enable EFI fake memory map"
- depends on EFI
- help
- Saying Y here will enable "efi_fake_mem" boot option. By specifying
- this parameter, you can add arbitrary attribute to specific memory
- range by updating original (firmware provided) EFI memmap. This is
- useful for debugging of EFI memmap related feature, e.g., Address
- Range Mirroring feature.
-
-config EFI_MAX_FAKE_MEM
- int "maximum allowable number of ranges in efi_fake_mem boot option"
- depends on EFI_FAKE_MEMMAP
- range 1 128
- default 8
- help
- Maximum allowable number of ranges in efi_fake_mem boot option.
- Ranges can be set up to this value using comma-separated list.
- The default value is 8.
-
config EFI_RUNTIME_MAP
bool "Export EFI runtime maps to sysfs" if EXPERT
depends on EFI
@@ -2425,23 +2412,35 @@ config STRICT_SIGALTSTACK_SIZE
Say 'N' unless you want to really enforce this check.
+config CFI_AUTO_DEFAULT
+ bool "Attempt to use FineIBT by default at boot time"
+ depends on FINEIBT
+ default y
+ help
+ Attempt to use FineIBT by default at boot time. If enabled,
+ this is the same as booting with "cfi=auto". If disabled,
+ this is the same as booting with "cfi=kcfi".
+
source "kernel/livepatch/Kconfig"
endmenu
config CC_HAS_NAMED_AS
- def_bool CC_IS_GCC && GCC_VERSION >= 120100
+ def_bool $(success,echo 'int __seg_fs fs; int __seg_gs gs;' | $(CC) -x c - -S -o /dev/null)
+ depends on CC_IS_GCC
+
+config CC_HAS_NAMED_AS_FIXED_SANITIZERS
+ def_bool CC_IS_GCC && GCC_VERSION >= 130300
config USE_X86_SEG_SUPPORT
def_bool y
depends on CC_HAS_NAMED_AS
#
- # -fsanitize=kernel-address (KASAN) is at the moment incompatible
- # with named address spaces - see GCC PR sanitizer/111736.
+ # -fsanitize=kernel-address (KASAN) and -fsanitize=thread
+ # (KCSAN) are incompatible with named address spaces with
+ # GCC < 13.3 - see GCC PR sanitizer/111736.
#
- depends on !KASAN
- # -fsanitize=thread (KCSAN) is also incompatible.
- depends on !KCSAN
+ depends on !(KASAN || KCSAN) || CC_HAS_NAMED_AS_FIXED_SANITIZERS
config CC_HAS_SLS
def_bool $(cc-option,-mharden-sls=all)
@@ -2608,24 +2607,15 @@ config MITIGATION_SLS
against straight line speculation. The kernel image might be slightly
larger.
-config MITIGATION_GDS_FORCE
- bool "Force GDS Mitigation"
+config MITIGATION_GDS
+ bool "Mitigate Gather Data Sampling"
depends on CPU_SUP_INTEL
- default n
+ default y
help
- Gather Data Sampling (GDS) is a hardware vulnerability which allows
- unprivileged speculative access to data which was previously stored in
- vector registers.
-
- This option is equivalent to setting gather_data_sampling=force on the
- command line. The microcode mitigation is used if present, otherwise
- AVX is disabled as a mitigation. On affected systems that are missing
- the microcode any userspace code that unconditionally uses AVX will
- break with this option set.
-
- Setting this option on systems not vulnerable to GDS has no effect.
-
- If in doubt, say N.
+ Enable mitigation for Gather Data Sampling (GDS). GDS is a hardware
+ vulnerability which allows unprivileged speculative access to data
+ which was previously stored in vector registers. The attacker uses gather
+ instructions to infer the stale vector register data.
config MITIGATION_RFDS
bool "RFDS Mitigation"
@@ -2648,6 +2638,107 @@ config MITIGATION_SPECTRE_BHI
indirect branches.
See <file:Documentation/admin-guide/hw-vuln/spectre.rst>
+config MITIGATION_MDS
+ bool "Mitigate Microarchitectural Data Sampling (MDS) hardware bug"
+ depends on CPU_SUP_INTEL
+ default y
+ help
+ Enable mitigation for Microarchitectural Data Sampling (MDS). MDS is
+ a hardware vulnerability which allows unprivileged speculative access
+ to data which is available in various CPU internal buffers.
+ See also <file:Documentation/admin-guide/hw-vuln/mds.rst>
+
+config MITIGATION_TAA
+ bool "Mitigate TSX Asynchronous Abort (TAA) hardware bug"
+ depends on CPU_SUP_INTEL
+ default y
+ help
+ Enable mitigation for TSX Asynchronous Abort (TAA). TAA is a hardware
+ vulnerability that allows unprivileged speculative access to data
+ which is available in various CPU internal buffers by using
+ asynchronous aborts within an Intel TSX transactional region.
+ See also <file:Documentation/admin-guide/hw-vuln/tsx_async_abort.rst>
+
+config MITIGATION_MMIO_STALE_DATA
+ bool "Mitigate MMIO Stale Data hardware bug"
+ depends on CPU_SUP_INTEL
+ default y
+ help
+ Enable mitigation for MMIO Stale Data hardware bugs. Processor MMIO
+ Stale Data Vulnerabilities are a class of memory-mapped I/O (MMIO)
+ vulnerabilities that can expose data. The vulnerabilities require the
+ attacker to have access to MMIO.
+ See also
+ <file:Documentation/admin-guide/hw-vuln/processor_mmio_stale_data.rst>
+
+config MITIGATION_L1TF
+ bool "Mitigate L1 Terminal Fault (L1TF) hardware bug"
+ depends on CPU_SUP_INTEL
+ default y
+ help
+ Mitigate L1 Terminal Fault (L1TF) hardware bug. L1 Terminal Fault is a
+ hardware vulnerability which allows unprivileged speculative access to data
+ available in the Level 1 Data Cache.
+ See <file:Documentation/admin-guide/hw-vuln/l1tf.rst
+
+config MITIGATION_RETBLEED
+ bool "Mitigate RETBleed hardware bug"
+ depends on (CPU_SUP_INTEL && MITIGATION_SPECTRE_V2) || MITIGATION_UNRET_ENTRY || MITIGATION_IBPB_ENTRY
+ default y
+ help
+ Enable mitigation for RETBleed (Arbitrary Speculative Code Execution
+ with Return Instructions) vulnerability. RETBleed is a speculative
+ execution attack which takes advantage of microarchitectural behavior
+ in many modern microprocessors, similar to Spectre v2. An
+ unprivileged attacker can use these flaws to bypass conventional
+ memory security restrictions to gain read access to privileged memory
+ that would otherwise be inaccessible.
+
+config MITIGATION_SPECTRE_V1
+ bool "Mitigate SPECTRE V1 hardware bug"
+ default y
+ help
+ Enable mitigation for Spectre V1 (Bounds Check Bypass). Spectre V1 is a
+ class of side channel attacks that takes advantage of speculative
+ execution that bypasses conditional branch instructions used for
+ memory access bounds check.
+ See also <file:Documentation/admin-guide/hw-vuln/spectre.rst>
+
+config MITIGATION_SPECTRE_V2
+ bool "Mitigate SPECTRE V2 hardware bug"
+ default y
+ help
+ Enable mitigation for Spectre V2 (Branch Target Injection). Spectre
+ V2 is a class of side channel attacks that takes advantage of
+ indirect branch predictors inside the processor. In Spectre variant 2
+ attacks, the attacker can steer speculative indirect branches in the
+ victim to gadget code by poisoning the branch target buffer of a CPU
+ used for predicting indirect branch addresses.
+ See also <file:Documentation/admin-guide/hw-vuln/spectre.rst>
+
+config MITIGATION_SRBDS
+ bool "Mitigate Special Register Buffer Data Sampling (SRBDS) hardware bug"
+ depends on CPU_SUP_INTEL
+ default y
+ help
+ Enable mitigation for Special Register Buffer Data Sampling (SRBDS).
+ SRBDS is a hardware vulnerability that allows Microarchitectural Data
+ Sampling (MDS) techniques to infer values returned from special
+ register accesses. An unprivileged user can extract values returned
+ from RDRAND and RDSEED executed on another core or sibling thread
+ using MDS techniques.
+ See also
+ <file:Documentation/admin-guide/hw-vuln/special-register-buffer-data-sampling.rst>
+
+config MITIGATION_SSB
+ bool "Mitigate Speculative Store Bypass (SSB) hardware bug"
+ default y
+ help
+ Enable mitigation for Speculative Store Bypass (SSB). SSB is a
+ hardware security vulnerability and its exploitation takes advantage
+ of speculative execution in a similar way to the Meltdown and Spectre
+ security vulnerabilities.
+
endif
config ARCH_HAS_ADD_PAGES
@@ -2977,9 +3068,13 @@ config OLPC_XO15_SCI
- AC adapter status updates
- Battery status updates
+config GEODE_COMMON
+ bool
+
config ALIX
bool "PCEngines ALIX System Support (LED setup)"
select GPIOLIB
+ select GEODE_COMMON
help
This option enables system support for the PCEngines ALIX.
At present this just sets up LEDs for GPIO control on
@@ -2994,12 +3089,14 @@ config ALIX
config NET5501
bool "Soekris Engineering net5501 System Support (LEDS, GPIO, etc)"
select GPIOLIB
+ select GEODE_COMMON
help
This option enables system support for the Soekris Engineering net5501.
config GEOS
bool "Traverse Technologies GEOS System Support (LEDS, GPIO, etc)"
select GPIOLIB
+ select GEODE_COMMON
depends on DMI
help
This option enables system support for the Traverse Technologies GEOS.
diff --git a/arch/x86/Kconfig.assembler b/arch/x86/Kconfig.assembler
index 8ad41da301e5..6d20a6ce0507 100644
--- a/arch/x86/Kconfig.assembler
+++ b/arch/x86/Kconfig.assembler
@@ -25,7 +25,17 @@ config AS_GFNI
help
Supported by binutils >= 2.30 and LLVM integrated assembler
+config AS_VAES
+ def_bool $(as-instr,vaesenc %ymm0$(comma)%ymm1$(comma)%ymm2)
+ help
+ Supported by binutils >= 2.30 and LLVM integrated assembler
+
+config AS_VPCLMULQDQ
+ def_bool $(as-instr,vpclmulqdq \$0x10$(comma)%ymm0$(comma)%ymm1$(comma)%ymm2)
+ help
+ Supported by binutils >= 2.30 and LLVM integrated assembler
+
config AS_WRUSS
- def_bool $(as-instr,wrussq %rax$(comma)(%rbx))
+ def_bool $(as-instr64,wrussq %rax$(comma)(%rbx))
help
Supported by binutils >= 2.31 and LLVM integrated assembler
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index c5d614d28a75..74777a97e394 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -248,6 +248,7 @@ config UNWINDER_ORC
config UNWINDER_FRAME_POINTER
bool "Frame pointer unwinder"
+ select ARCH_WANT_FRAME_POINTERS
select FRAME_POINTER
help
This option enables the frame pointer unwinder for unwinding kernel
@@ -271,7 +272,3 @@ config UNWINDER_GUESS
overhead.
endchoice
-
-config FRAME_POINTER
- depends on !UNWINDER_ORC && !UNWINDER_GUESS
- bool
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 5ab93fcdd691..cd75e78a06c1 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -24,11 +24,15 @@ RETPOLINE_CFLAGS += $(call cc-option,-mindirect-branch-cs-prefix)
ifdef CONFIG_MITIGATION_RETHUNK
RETHUNK_CFLAGS := -mfunction-return=thunk-extern
+RETHUNK_RUSTFLAGS := -Zfunction-return=thunk-extern
RETPOLINE_CFLAGS += $(RETHUNK_CFLAGS)
+RETPOLINE_RUSTFLAGS += $(RETHUNK_RUSTFLAGS)
endif
export RETHUNK_CFLAGS
+export RETHUNK_RUSTFLAGS
export RETPOLINE_CFLAGS
+export RETPOLINE_RUSTFLAGS
export RETPOLINE_VDSO_CFLAGS
# For gcc stack alignment is specified with -mpreferred-stack-boundary,
@@ -74,6 +78,26 @@ KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx
KBUILD_RUSTFLAGS += --target=$(objtree)/scripts/target.json
KBUILD_RUSTFLAGS += -Ctarget-feature=-sse,-sse2,-sse3,-ssse3,-sse4.1,-sse4.2,-avx,-avx2
+#
+# CFLAGS for compiling floating point code inside the kernel.
+#
+CC_FLAGS_FPU := -msse -msse2
+ifdef CONFIG_CC_IS_GCC
+# Stack alignment mismatch, proceed with caution.
+# GCC < 7.1 cannot compile code using `double` and -mpreferred-stack-boundary=3
+# (8B stack alignment).
+# See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53383
+#
+# The "-msse" in the first argument is there so that the
+# -mpreferred-stack-boundary=3 build error:
+#
+# -mpreferred-stack-boundary=3 is not between 4 and 12
+#
+# can be triggered. Otherwise gcc doesn't complain.
+CC_FLAGS_FPU += -mhard-float
+CC_FLAGS_FPU += $(call cc-option,-msse -mpreferred-stack-boundary=3,-mpreferred-stack-boundary=4)
+endif
+
ifeq ($(CONFIG_X86_KERNEL_IBT),y)
#
# Kernel IBT has S_CET.NOTRACK_EN=0, as such the compilers must not generate
@@ -198,9 +222,10 @@ KBUILD_CFLAGS += -fno-asynchronous-unwind-tables
# Avoid indirect branches in kernel to deal with Spectre
ifdef CONFIG_MITIGATION_RETPOLINE
KBUILD_CFLAGS += $(RETPOLINE_CFLAGS)
+ KBUILD_RUSTFLAGS += $(RETPOLINE_RUSTFLAGS)
# Additionally, avoid generating expensive indirect jumps which
# are subject to retpolines for small number of switch cases.
- # clang turns off jump table generation by default when under
+ # LLVM turns off jump table generation by default when under
# retpoline builds, however, gcc does not for x86. This has
# only been fixed starting from gcc stable version 8.4.0 and
# onwards, but not for older ones. See gcc bug #86952.
@@ -217,6 +242,10 @@ ifdef CONFIG_CALL_PADDING
PADDING_CFLAGS := -fpatchable-function-entry=$(CONFIG_FUNCTION_PADDING_BYTES),$(CONFIG_FUNCTION_PADDING_BYTES)
KBUILD_CFLAGS += $(PADDING_CFLAGS)
export PADDING_CFLAGS
+
+PADDING_RUSTFLAGS := -Zpatchable-function-entry=$(CONFIG_FUNCTION_PADDING_BYTES),$(CONFIG_FUNCTION_PADDING_BYTES)
+KBUILD_RUSTFLAGS += $(PADDING_RUSTFLAGS)
+export PADDING_RUSTFLAGS
endif
KBUILD_LDFLAGS += -m elf_$(UTS_MACHINE)
@@ -258,7 +287,7 @@ drivers-$(CONFIG_PCI) += arch/x86/pci/
# suspend and hibernation support
drivers-$(CONFIG_PM) += arch/x86/power/
-drivers-$(CONFIG_FB_CORE) += arch/x86/video/
+drivers-$(CONFIG_VIDEO) += arch/x86/video/
####
# boot loader support. Several targets are kept for legacy purposes
diff --git a/arch/x86/Makefile.um b/arch/x86/Makefile.um
index 2106a2bd152b..a46b1397ad01 100644
--- a/arch/x86/Makefile.um
+++ b/arch/x86/Makefile.um
@@ -9,6 +9,7 @@ core-y += arch/x86/crypto/
#
ifeq ($(CONFIG_CC_IS_CLANG),y)
KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx
+KBUILD_RUSTFLAGS += --target=$(objtree)/scripts/target.json
KBUILD_RUSTFLAGS += -Ctarget-feature=-sse,-sse2,-sse3,-ssse3,-sse4.1,-sse4.2,-avx,-avx2
endif
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index 3cece19b7473..9cc0ff6e9067 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -9,19 +9,6 @@
# Changed by many, many contributors over the years.
#
-# Sanitizer runtimes are unavailable and cannot be linked for early boot code.
-KASAN_SANITIZE := n
-KCSAN_SANITIZE := n
-KMSAN_SANITIZE := n
-OBJECT_FILES_NON_STANDARD := y
-
-# Kernel does not boot with kcov instrumentation here.
-# One of the problems observed was insertion of __sanitizer_cov_trace_pc()
-# callback into middle of per-cpu data enabling code. Thus the callback observed
-# inconsistent state and crashed. We are interested mostly in syscall coverage,
-# so boot code is not interesting anyway.
-KCOV_INSTRUMENT := n
-
# If you want to preset the SVGA mode, uncomment the next line and
# set SVGA_MODE to whatever number you want.
# Set it to -DSVGA_MODE=NORMAL_VGA if you just want the EGA/VGA mode.
@@ -69,8 +56,7 @@ KBUILD_CFLAGS := $(REALMODE_CFLAGS) -D_SETUP
KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__
KBUILD_CFLAGS += $(call cc-option,-fmacro-prefix-map=$(srctree)/=)
KBUILD_CFLAGS += -fno-asynchronous-unwind-tables
-GCOV_PROFILE := n
-UBSAN_SANITIZE := n
+KBUILD_CFLAGS += $(CONFIG_CC_IMPLICIT_FALLTHROUGH)
$(obj)/bzImage: asflags-y := $(SVGA_MODE)
@@ -129,7 +115,7 @@ targets += mtools.conf
# genimage.sh requires bash, but it also has a bunch of other
# external dependencies.
quiet_cmd_genimage = GENIMAGE $3
-cmd_genimage = $(BASH) $(srctree)/$(src)/genimage.sh $2 $3 $(obj)/bzImage \
+ cmd_genimage = $(BASH) $(src)/genimage.sh $2 $3 $(obj)/bzImage \
$(obj)/mtools.conf '$(FDARGS)' $(FDINITRD)
PHONY += bzdisk fdimage fdimage144 fdimage288 hdimage isoimage
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index e9522c6893be..f2051644de94 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -17,15 +17,6 @@
# (see scripts/Makefile.lib size_append)
# compressed vmlinux.bin.all + u32 size of vmlinux.bin.all
-# Sanitizer runtimes are unavailable and cannot be linked for early boot code.
-KASAN_SANITIZE := n
-KCSAN_SANITIZE := n
-KMSAN_SANITIZE := n
-OBJECT_FILES_NON_STANDARD := y
-
-# Prevents link failures: __sanitizer_cov_trace_pc() is not linked in.
-KCOV_INSTRUMENT := n
-
targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma \
vmlinux.bin.xz vmlinux.bin.lzo vmlinux.bin.lz4 vmlinux.bin.zst
@@ -59,8 +50,6 @@ KBUILD_CFLAGS += -include $(srctree)/include/linux/hidden.h
CFLAGS_sev.o += -I$(objtree)/arch/x86/lib/
KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__
-GCOV_PROFILE := n
-UBSAN_SANITIZE :=n
KBUILD_LDFLAGS := -m elf_$(UTS_MACHINE)
KBUILD_LDFLAGS += $(call ld-option,--no-ld-generated-unwind-info)
@@ -116,9 +105,9 @@ vmlinux-objs-$(CONFIG_UNACCEPTED_MEMORY) += $(obj)/mem.o
vmlinux-objs-$(CONFIG_EFI) += $(obj)/efi.o
vmlinux-objs-$(CONFIG_EFI_MIXED) += $(obj)/efi_mixed.o
-vmlinux-objs-$(CONFIG_EFI_STUB) += $(objtree)/drivers/firmware/efi/libstub/lib.a
+vmlinux-libs-$(CONFIG_EFI_STUB) += $(objtree)/drivers/firmware/efi/libstub/lib.a
-$(obj)/vmlinux: $(vmlinux-objs-y) FORCE
+$(obj)/vmlinux: $(vmlinux-objs-y) $(vmlinux-libs-y) FORCE
$(call if_changed,ld)
OBJCOPYFLAGS_vmlinux.bin := -R .comment -S
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index bf4a10a5794f..1dcb794c5479 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -398,6 +398,11 @@ SYM_CODE_START(startup_64)
call sev_enable
#endif
+ /* Preserve only the CR4 bits that must be preserved, and clear the rest */
+ movq %cr4, %rax
+ andl $(X86_CR4_PAE | X86_CR4_MCE | X86_CR4_LA57), %eax
+ movq %rax, %cr4
+
/*
* configure_5level_paging() updates the number of paging levels using
* a trampoline in 32-bit addressable memory if the current number does
diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c
index dec961c6d16a..f4d82379bf44 100644
--- a/arch/x86/boot/compressed/kaslr.c
+++ b/arch/x86/boot/compressed/kaslr.c
@@ -119,13 +119,8 @@ char *skip_spaces(const char *str)
#include "../../../../lib/ctype.c"
#include "../../../../lib/cmdline.c"
-enum parse_mode {
- PARSE_MEMMAP,
- PARSE_EFI,
-};
-
static int
-parse_memmap(char *p, u64 *start, u64 *size, enum parse_mode mode)
+parse_memmap(char *p, u64 *start, u64 *size)
{
char *oldp;
@@ -148,29 +143,11 @@ parse_memmap(char *p, u64 *start, u64 *size, enum parse_mode mode)
*start = memparse(p + 1, &p);
return 0;
case '@':
- if (mode == PARSE_MEMMAP) {
- /*
- * memmap=nn@ss specifies usable region, should
- * be skipped
- */
- *size = 0;
- } else {
- u64 flags;
-
- /*
- * efi_fake_mem=nn@ss:attr the attr specifies
- * flags that might imply a soft-reservation.
- */
- *start = memparse(p + 1, &p);
- if (p && *p == ':') {
- p++;
- if (kstrtoull(p, 0, &flags) < 0)
- *size = 0;
- else if (flags & EFI_MEMORY_SP)
- return 0;
- }
- *size = 0;
- }
+ /*
+ * memmap=nn@ss specifies usable region, should
+ * be skipped
+ */
+ *size = 0;
fallthrough;
default:
/*
@@ -185,7 +162,7 @@ parse_memmap(char *p, u64 *start, u64 *size, enum parse_mode mode)
return -EINVAL;
}
-static void mem_avoid_memmap(enum parse_mode mode, char *str)
+static void mem_avoid_memmap(char *str)
{
static int i;
@@ -200,7 +177,7 @@ static void mem_avoid_memmap(enum parse_mode mode, char *str)
if (k)
*k++ = 0;
- rc = parse_memmap(str, &start, &size, mode);
+ rc = parse_memmap(str, &start, &size);
if (rc < 0)
break;
str = k;
@@ -281,7 +258,7 @@ static void handle_mem_options(void)
break;
if (!strcmp(param, "memmap")) {
- mem_avoid_memmap(PARSE_MEMMAP, val);
+ mem_avoid_memmap(val);
} else if (IS_ENABLED(CONFIG_X86_64) && strstr(param, "hugepages")) {
parse_gb_huge_pages(param, val);
} else if (!strcmp(param, "mem")) {
@@ -295,8 +272,6 @@ static void handle_mem_options(void)
if (mem_size < mem_limit)
mem_limit = mem_size;
- } else if (!strcmp(param, "efi_fake_mem")) {
- mem_avoid_memmap(PARSE_EFI, val);
}
}
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index b70e4a21c15f..04a35b2c26e9 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -511,7 +511,7 @@ asmlinkage __visible void *extract_kernel(void *rmode, unsigned char *output)
if (init_unaccepted_memory()) {
debug_putstr("Accepting memory... ");
- accept_memory(__pa(output), __pa(output) + needed_size);
+ accept_memory(__pa(output), needed_size);
}
entry_offset = decompress_kernel(output, virt_addr, error);
@@ -531,8 +531,3 @@ asmlinkage __visible void *extract_kernel(void *rmode, unsigned char *output)
return output + entry_offset;
}
-
-void __fortify_panic(const u8 reason, size_t avail, size_t size)
-{
- error("detected buffer overflow");
-}
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index b353a7be380c..dd8d1a85f671 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -256,6 +256,6 @@ static inline bool init_unaccepted_memory(void) { return false; }
/* Defined in EFI stub */
extern struct efi_unaccepted_memory *unaccepted_table;
-void accept_memory(phys_addr_t start, phys_addr_t end);
+void accept_memory(phys_addr_t start, unsigned long size);
#endif /* BOOT_COMPRESSED_MISC_H */
diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c
index ec71846d28c9..cd44e120fe53 100644
--- a/arch/x86/boot/compressed/sev.c
+++ b/arch/x86/boot/compressed/sev.c
@@ -127,7 +127,35 @@ static bool fault_in_kernel_space(unsigned long address)
#include "../../lib/insn.c"
/* Include code for early handlers */
-#include "../../kernel/sev-shared.c"
+#include "../../coco/sev/shared.c"
+
+static struct svsm_ca *svsm_get_caa(void)
+{
+ return boot_svsm_caa;
+}
+
+static u64 svsm_get_caa_pa(void)
+{
+ return boot_svsm_caa_pa;
+}
+
+static int svsm_perform_call_protocol(struct svsm_call *call)
+{
+ struct ghcb *ghcb;
+ int ret;
+
+ if (boot_ghcb)
+ ghcb = boot_ghcb;
+ else
+ ghcb = NULL;
+
+ do {
+ ret = ghcb ? svsm_perform_ghcb_protocol(ghcb, call)
+ : svsm_perform_msr_protocol(call);
+ } while (ret == -EAGAIN);
+
+ return ret;
+}
bool sev_snp_enabled(void)
{
@@ -145,8 +173,8 @@ static void __page_state_change(unsigned long paddr, enum psc_op op)
* If private -> shared then invalidate the page before requesting the
* state change in the RMP table.
*/
- if (op == SNP_PAGE_STATE_SHARED && pvalidate(paddr, RMP_PG_SIZE_4K, 0))
- sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE);
+ if (op == SNP_PAGE_STATE_SHARED)
+ pvalidate_4k_page(paddr, paddr, false);
/* Issue VMGEXIT to change the page state in RMP table. */
sev_es_wr_ghcb_msr(GHCB_MSR_PSC_REQ_GFN(paddr >> PAGE_SHIFT, op));
@@ -161,8 +189,8 @@ static void __page_state_change(unsigned long paddr, enum psc_op op)
* Now that page state is changed in the RMP table, validate it so that it is
* consistent with the RMP entry.
*/
- if (op == SNP_PAGE_STATE_PRIVATE && pvalidate(paddr, RMP_PG_SIZE_4K, 1))
- sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE);
+ if (op == SNP_PAGE_STATE_PRIVATE)
+ pvalidate_4k_page(paddr, paddr, true);
}
void snp_set_page_private(unsigned long paddr)
@@ -256,6 +284,16 @@ void sev_es_shutdown_ghcb(void)
error("SEV-ES CPU Features missing.");
/*
+ * This denotes whether to use the GHCB MSR protocol or the GHCB
+ * shared page to perform a GHCB request. Since the GHCB page is
+ * being changed to encrypted, it can't be used to perform GHCB
+ * requests. Clear the boot_ghcb variable so that the GHCB MSR
+ * protocol is used to change the GHCB page over to an encrypted
+ * page.
+ */
+ boot_ghcb = NULL;
+
+ /*
* GHCB Page must be flushed from the cache and mapped encrypted again.
* Otherwise the running kernel will see strange cache effects when
* trying to use that page.
@@ -335,26 +373,6 @@ finish:
sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ);
}
-static void enforce_vmpl0(void)
-{
- u64 attrs;
- int err;
-
- /*
- * RMPADJUST modifies RMP permissions of a lesser-privileged (numerically
- * higher) privilege level. Here, clear the VMPL1 permission mask of the
- * GHCB page. If the guest is not running at VMPL0, this will fail.
- *
- * If the guest is running at VMPL0, it will succeed. Even if that operation
- * modifies permission bits, it is still ok to do so currently because Linux
- * SNP guests are supported only on VMPL0 so VMPL1 or higher permission masks
- * changing is a don't-care.
- */
- attrs = 1;
- if (rmpadjust((unsigned long)&boot_ghcb_page, RMP_PG_SIZE_4K, attrs))
- sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_NOT_VMPL0);
-}
-
/*
* SNP_FEATURES_IMPL_REQ is the mask of SNP features that will need
* guest side implementation for proper functioning of the guest. If any
@@ -413,6 +431,92 @@ void snp_check_features(void)
}
}
+/* Search for Confidential Computing blob in the EFI config table. */
+static struct cc_blob_sev_info *find_cc_blob_efi(struct boot_params *bp)
+{
+ unsigned long cfg_table_pa;
+ unsigned int cfg_table_len;
+ int ret;
+
+ ret = efi_get_conf_table(bp, &cfg_table_pa, &cfg_table_len);
+ if (ret)
+ return NULL;
+
+ return (struct cc_blob_sev_info *)efi_find_vendor_table(bp, cfg_table_pa,
+ cfg_table_len,
+ EFI_CC_BLOB_GUID);
+}
+
+/*
+ * Initial set up of SNP relies on information provided by the
+ * Confidential Computing blob, which can be passed to the boot kernel
+ * by firmware/bootloader in the following ways:
+ *
+ * - via an entry in the EFI config table
+ * - via a setup_data structure, as defined by the Linux Boot Protocol
+ *
+ * Scan for the blob in that order.
+ */
+static struct cc_blob_sev_info *find_cc_blob(struct boot_params *bp)
+{
+ struct cc_blob_sev_info *cc_info;
+
+ cc_info = find_cc_blob_efi(bp);
+ if (cc_info)
+ goto found_cc_info;
+
+ cc_info = find_cc_blob_setup_data(bp);
+ if (!cc_info)
+ return NULL;
+
+found_cc_info:
+ if (cc_info->magic != CC_BLOB_SEV_HDR_MAGIC)
+ sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED);
+
+ return cc_info;
+}
+
+/*
+ * Indicate SNP based on presence of SNP-specific CC blob. Subsequent checks
+ * will verify the SNP CPUID/MSR bits.
+ */
+static bool early_snp_init(struct boot_params *bp)
+{
+ struct cc_blob_sev_info *cc_info;
+
+ if (!bp)
+ return false;
+
+ cc_info = find_cc_blob(bp);
+ if (!cc_info)
+ return false;
+
+ /*
+ * If a SNP-specific Confidential Computing blob is present, then
+ * firmware/bootloader have indicated SNP support. Verifying this
+ * involves CPUID checks which will be more reliable if the SNP
+ * CPUID table is used. See comments over snp_setup_cpuid_table() for
+ * more details.
+ */
+ setup_cpuid_table(cc_info);
+
+ /*
+ * Record the SVSM Calling Area (CA) address if the guest is not
+ * running at VMPL0. The CA will be used to communicate with the
+ * SVSM and request its services.
+ */
+ svsm_setup_ca(cc_info);
+
+ /*
+ * Pass run-time kernel a pointer to CC info via boot_params so EFI
+ * config table doesn't need to be searched again during early startup
+ * phase.
+ */
+ bp->cc_blob_address = (u32)(unsigned long)cc_info;
+
+ return true;
+}
+
/*
* sev_check_cpu_support - Check for SEV support in the CPU capabilities
*
@@ -463,7 +567,7 @@ void sev_enable(struct boot_params *bp)
bp->cc_blob_address = 0;
/*
- * Do an initial SEV capability check before snp_init() which
+ * Do an initial SEV capability check before early_snp_init() which
* loads the CPUID page and the same checks afterwards are done
* without the hypervisor and are trustworthy.
*
@@ -478,7 +582,7 @@ void sev_enable(struct boot_params *bp)
* Setup/preliminary detection of SNP. This will be sanity-checked
* against CPUID/MSR values later.
*/
- snp = snp_init(bp);
+ snp = early_snp_init(bp);
/* Now repeat the checks with the SNP CPUID table. */
@@ -506,10 +610,32 @@ void sev_enable(struct boot_params *bp)
* features.
*/
if (sev_status & MSR_AMD64_SEV_SNP_ENABLED) {
- if (!(get_hv_features() & GHCB_HV_FT_SNP))
+ u64 hv_features;
+ int ret;
+
+ hv_features = get_hv_features();
+ if (!(hv_features & GHCB_HV_FT_SNP))
sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED);
- enforce_vmpl0();
+ /*
+ * Enforce running at VMPL0 or with an SVSM.
+ *
+ * Use RMPADJUST (see the rmpadjust() function for a description of
+ * what the instruction does) to update the VMPL1 permissions of a
+ * page. If the guest is running at VMPL0, this will succeed. If the
+ * guest is running at any other VMPL, this will fail. Linux SNP guests
+ * only ever run at a single VMPL level so permission mask changes of a
+ * lesser-privileged VMPL are a don't-care.
+ */
+ ret = rmpadjust((unsigned long)&boot_ghcb_page, RMP_PG_SIZE_4K, 1);
+
+ /*
+ * Running at VMPL0 is not required if an SVSM is present and the hypervisor
+ * supports the required SVSM GHCB events.
+ */
+ if (ret &&
+ !(snp_vmpl && (hv_features & GHCB_HV_FT_SNP_MULTI_VMPL)))
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_NOT_VMPL0);
}
if (snp && !(sev_status & MSR_AMD64_SEV_SNP_ENABLED))
@@ -535,85 +661,6 @@ u64 sev_get_status(void)
return m.q;
}
-/* Search for Confidential Computing blob in the EFI config table. */
-static struct cc_blob_sev_info *find_cc_blob_efi(struct boot_params *bp)
-{
- unsigned long cfg_table_pa;
- unsigned int cfg_table_len;
- int ret;
-
- ret = efi_get_conf_table(bp, &cfg_table_pa, &cfg_table_len);
- if (ret)
- return NULL;
-
- return (struct cc_blob_sev_info *)efi_find_vendor_table(bp, cfg_table_pa,
- cfg_table_len,
- EFI_CC_BLOB_GUID);
-}
-
-/*
- * Initial set up of SNP relies on information provided by the
- * Confidential Computing blob, which can be passed to the boot kernel
- * by firmware/bootloader in the following ways:
- *
- * - via an entry in the EFI config table
- * - via a setup_data structure, as defined by the Linux Boot Protocol
- *
- * Scan for the blob in that order.
- */
-static struct cc_blob_sev_info *find_cc_blob(struct boot_params *bp)
-{
- struct cc_blob_sev_info *cc_info;
-
- cc_info = find_cc_blob_efi(bp);
- if (cc_info)
- goto found_cc_info;
-
- cc_info = find_cc_blob_setup_data(bp);
- if (!cc_info)
- return NULL;
-
-found_cc_info:
- if (cc_info->magic != CC_BLOB_SEV_HDR_MAGIC)
- sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED);
-
- return cc_info;
-}
-
-/*
- * Indicate SNP based on presence of SNP-specific CC blob. Subsequent checks
- * will verify the SNP CPUID/MSR bits.
- */
-bool snp_init(struct boot_params *bp)
-{
- struct cc_blob_sev_info *cc_info;
-
- if (!bp)
- return false;
-
- cc_info = find_cc_blob(bp);
- if (!cc_info)
- return false;
-
- /*
- * If a SNP-specific Confidential Computing blob is present, then
- * firmware/bootloader have indicated SNP support. Verifying this
- * involves CPUID checks which will be more reliable if the SNP
- * CPUID table is used. See comments over snp_setup_cpuid_table() for
- * more details.
- */
- setup_cpuid_table(cc_info);
-
- /*
- * Pass run-time kernel a pointer to CC info via boot_params so EFI
- * config table doesn't need to be searched again during early startup
- * phase.
- */
- bp->cc_blob_address = (u32)(unsigned long)cc_info;
-
- return true;
-}
-
void sev_prep_identity_maps(unsigned long top_level_pgt)
{
/*
diff --git a/arch/x86/boot/cpucheck.c b/arch/x86/boot/cpucheck.c
index fed8d13ce252..0aae4d4ed615 100644
--- a/arch/x86/boot/cpucheck.c
+++ b/arch/x86/boot/cpucheck.c
@@ -203,7 +203,7 @@ int check_knl_erratum(void)
*/
if (!is_intel() ||
cpu.family != 6 ||
- cpu.model != INTEL_FAM6_XEON_PHI_KNL)
+ cpu.model != 0x57 /*INTEL_XEON_PHI_KNL*/)
return 0;
/*
diff --git a/arch/x86/boot/install.sh b/arch/x86/boot/install.sh
index 0849f4b42745..93784abcd66d 100755
--- a/arch/x86/boot/install.sh
+++ b/arch/x86/boot/install.sh
@@ -16,6 +16,8 @@
# $3 - kernel map file
# $4 - default install path (blank if root directory)
+set -e
+
if [ -f $4/vmlinuz ]; then
mv $4/vmlinuz $4/vmlinuz.old
fi
diff --git a/arch/x86/boot/main.c b/arch/x86/boot/main.c
index c4ea5258ab55..9d0fea18d3c8 100644
--- a/arch/x86/boot/main.c
+++ b/arch/x86/boot/main.c
@@ -27,34 +27,32 @@ char *heap_end = _end; /* Default end of heap = no heap */
* screws up the old-style command line protocol, adjust by
* filling in the new-style command line pointer instead.
*/
-
static void copy_boot_params(void)
{
struct old_cmdline {
u16 cl_magic;
u16 cl_offset;
};
- const struct old_cmdline * const oldcmd =
- absolute_pointer(OLD_CL_ADDRESS);
+ const struct old_cmdline * const oldcmd = absolute_pointer(OLD_CL_ADDRESS);
BUILD_BUG_ON(sizeof(boot_params) != 4096);
memcpy(&boot_params.hdr, &hdr, sizeof(hdr));
- if (!boot_params.hdr.cmd_line_ptr &&
- oldcmd->cl_magic == OLD_CL_MAGIC) {
- /* Old-style command line protocol. */
+ if (!boot_params.hdr.cmd_line_ptr && oldcmd->cl_magic == OLD_CL_MAGIC) {
+ /* Old-style command line protocol */
u16 cmdline_seg;
- /* Figure out if the command line falls in the region
- of memory that an old kernel would have copied up
- to 0x90000... */
+ /*
+ * Figure out if the command line falls in the region
+ * of memory that an old kernel would have copied up
+ * to 0x90000...
+ */
if (oldcmd->cl_offset < boot_params.hdr.setup_move_size)
cmdline_seg = ds();
else
cmdline_seg = 0x9000;
- boot_params.hdr.cmd_line_ptr =
- (cmdline_seg << 4) + oldcmd->cl_offset;
+ boot_params.hdr.cmd_line_ptr = (cmdline_seg << 4) + oldcmd->cl_offset;
}
}
@@ -66,6 +64,7 @@ static void copy_boot_params(void)
static void keyboard_init(void)
{
struct biosregs ireg, oreg;
+
initregs(&ireg);
ireg.ah = 0x02; /* Get keyboard status */
@@ -83,8 +82,10 @@ static void query_ist(void)
{
struct biosregs ireg, oreg;
- /* Some older BIOSes apparently crash on this call, so filter
- it from machines too old to have SpeedStep at all. */
+ /*
+ * Some older BIOSes apparently crash on this call, so filter
+ * it from machines too old to have SpeedStep at all.
+ */
if (cpu.level < 6)
return;
@@ -119,17 +120,13 @@ static void init_heap(void)
char *stack_end;
if (boot_params.hdr.loadflags & CAN_USE_HEAP) {
- asm("leal %P1(%%esp),%0"
- : "=r" (stack_end) : "i" (-STACK_SIZE));
-
- heap_end = (char *)
- ((size_t)boot_params.hdr.heap_end_ptr + 0x200);
+ stack_end = (char *) (current_stack_pointer - STACK_SIZE);
+ heap_end = (char *) ((size_t)boot_params.hdr.heap_end_ptr + 0x200);
if (heap_end > stack_end)
heap_end = stack_end;
} else {
/* Boot protocol 2.00 only, no heap available */
- puts("WARNING: Ancient bootloader, some functionality "
- "may be limited!\n");
+ puts("WARNING: Ancient bootloader, some functionality may be limited!\n");
}
}
@@ -150,12 +147,11 @@ void main(void)
/* Make sure we have all the proper CPU support */
if (validate_cpu()) {
- puts("Unable to boot - please use a kernel appropriate "
- "for your CPU.\n");
+ puts("Unable to boot - please use a kernel appropriate for your CPU.\n");
die();
}
- /* Tell the BIOS what CPU mode we intend to run in. */
+ /* Tell the BIOS what CPU mode we intend to run in */
set_bios_mode();
/* Detect memory layout */
diff --git a/arch/x86/boot/printf.c b/arch/x86/boot/printf.c
index 1237beeb9540..51dc14b714f6 100644
--- a/arch/x86/boot/printf.c
+++ b/arch/x86/boot/printf.c
@@ -246,6 +246,7 @@ int vsprintf(char *buf, const char *fmt, va_list args)
case 'x':
flags |= SMALL;
+ fallthrough;
case 'X':
base = 16;
break;
@@ -253,6 +254,8 @@ int vsprintf(char *buf, const char *fmt, va_list args)
case 'd':
case 'i':
flags |= SIGN;
+ break;
+
case 'u':
break;
diff --git a/arch/x86/coco/Makefile b/arch/x86/coco/Makefile
index c816acf78b6a..eabdc7486538 100644
--- a/arch/x86/coco/Makefile
+++ b/arch/x86/coco/Makefile
@@ -6,3 +6,4 @@ CFLAGS_core.o += -fno-stack-protector
obj-y += core.o
obj-$(CONFIG_INTEL_TDX_GUEST) += tdx/
+obj-$(CONFIG_AMD_MEM_ENCRYPT) += sev/
diff --git a/arch/x86/coco/core.c b/arch/x86/coco/core.c
index b31ef2424d19..0f81f70aca82 100644
--- a/arch/x86/coco/core.c
+++ b/arch/x86/coco/core.c
@@ -29,7 +29,6 @@ static bool noinstr intel_cc_platform_has(enum cc_attr attr)
{
switch (attr) {
case CC_ATTR_GUEST_UNROLL_STRING_IO:
- case CC_ATTR_HOTPLUG_DISABLED:
case CC_ATTR_GUEST_MEM_ENCRYPT:
case CC_ATTR_MEM_ENCRYPT:
return true;
diff --git a/arch/x86/coco/sev/Makefile b/arch/x86/coco/sev/Makefile
new file mode 100644
index 000000000000..4e375e7305ac
--- /dev/null
+++ b/arch/x86/coco/sev/Makefile
@@ -0,0 +1,15 @@
+# SPDX-License-Identifier: GPL-2.0
+
+obj-y += core.o
+
+ifdef CONFIG_FUNCTION_TRACER
+CFLAGS_REMOVE_core.o = -pg
+endif
+
+KASAN_SANITIZE_core.o := n
+KMSAN_SANITIZE_core.o := n
+KCOV_INSTRUMENT_core.o := n
+
+# With some compiler versions the generated code results in boot hangs, caused
+# by several compilation units. To be safe, disable all instrumentation.
+KCSAN_SANITIZE := n
diff --git a/arch/x86/kernel/sev.c b/arch/x86/coco/sev/core.c
index 38ad066179d8..de1df0cb45da 100644
--- a/arch/x86/kernel/sev.c
+++ b/arch/x86/coco/sev/core.c
@@ -133,16 +133,20 @@ struct ghcb_state {
struct ghcb *ghcb;
};
+/* For early boot SVSM communication */
+static struct svsm_ca boot_svsm_ca_page __aligned(PAGE_SIZE);
+
static DEFINE_PER_CPU(struct sev_es_runtime_data*, runtime_data);
static DEFINE_PER_CPU(struct sev_es_save_area *, sev_vmsa);
+static DEFINE_PER_CPU(struct svsm_ca *, svsm_caa);
+static DEFINE_PER_CPU(u64, svsm_caa_pa);
struct sev_config {
__u64 debug : 1,
/*
- * A flag used by __set_pages_state() that indicates when the
- * per-CPU GHCB has been created and registered and thus can be
- * used by the BSP instead of the early boot GHCB.
+ * Indicates when the per-CPU GHCB has been created and registered
+ * and thus can be used by the BSP instead of the early boot GHCB.
*
* For APs, the per-CPU GHCB is created before they are started
* and registered upon startup, so this flag can be used globally
@@ -150,7 +154,16 @@ struct sev_config {
*/
ghcbs_initialized : 1,
- __reserved : 62;
+ /*
+ * Indicates when the per-CPU SVSM CA is to be used instead of the
+ * boot SVSM CA.
+ *
+ * For APs, the per-CPU SVSM CA is created as part of the AP
+ * bringup, so this flag can be used globally for the BSP and APs.
+ */
+ use_cas : 1,
+
+ __reserved : 61;
};
static struct sev_config sev_cfg __read_mostly;
@@ -572,8 +585,61 @@ fault:
return ES_EXCEPTION;
}
+static __always_inline void vc_forward_exception(struct es_em_ctxt *ctxt)
+{
+ long error_code = ctxt->fi.error_code;
+ int trapnr = ctxt->fi.vector;
+
+ ctxt->regs->orig_ax = ctxt->fi.error_code;
+
+ switch (trapnr) {
+ case X86_TRAP_GP:
+ exc_general_protection(ctxt->regs, error_code);
+ break;
+ case X86_TRAP_UD:
+ exc_invalid_op(ctxt->regs);
+ break;
+ case X86_TRAP_PF:
+ write_cr2(ctxt->fi.cr2);
+ exc_page_fault(ctxt->regs, error_code);
+ break;
+ case X86_TRAP_AC:
+ exc_alignment_check(ctxt->regs, error_code);
+ break;
+ default:
+ pr_emerg("Unsupported exception in #VC instruction emulation - can't continue\n");
+ BUG();
+ }
+}
+
/* Include code shared with pre-decompression boot stage */
-#include "sev-shared.c"
+#include "shared.c"
+
+static inline struct svsm_ca *svsm_get_caa(void)
+{
+ /*
+ * Use rIP-relative references when called early in the boot. If
+ * ->use_cas is set, then it is late in the boot and no need
+ * to worry about rIP-relative references.
+ */
+ if (RIP_REL_REF(sev_cfg).use_cas)
+ return this_cpu_read(svsm_caa);
+ else
+ return RIP_REL_REF(boot_svsm_caa);
+}
+
+static u64 svsm_get_caa_pa(void)
+{
+ /*
+ * Use rIP-relative references when called early in the boot. If
+ * ->use_cas is set, then it is late in the boot and no need
+ * to worry about rIP-relative references.
+ */
+ if (RIP_REL_REF(sev_cfg).use_cas)
+ return this_cpu_read(svsm_caa_pa);
+ else
+ return RIP_REL_REF(boot_svsm_caa_pa);
+}
static noinstr void __sev_put_ghcb(struct ghcb_state *state)
{
@@ -600,6 +666,44 @@ static noinstr void __sev_put_ghcb(struct ghcb_state *state)
}
}
+static int svsm_perform_call_protocol(struct svsm_call *call)
+{
+ struct ghcb_state state;
+ unsigned long flags;
+ struct ghcb *ghcb;
+ int ret;
+
+ /*
+ * This can be called very early in the boot, use native functions in
+ * order to avoid paravirt issues.
+ */
+ flags = native_local_irq_save();
+
+ /*
+ * Use rip-relative references when called early in the boot. If
+ * ghcbs_initialized is set, then it is late in the boot and no need
+ * to worry about rip-relative references in called functions.
+ */
+ if (RIP_REL_REF(sev_cfg).ghcbs_initialized)
+ ghcb = __sev_get_ghcb(&state);
+ else if (RIP_REL_REF(boot_ghcb))
+ ghcb = RIP_REL_REF(boot_ghcb);
+ else
+ ghcb = NULL;
+
+ do {
+ ret = ghcb ? svsm_perform_ghcb_protocol(ghcb, call)
+ : svsm_perform_msr_protocol(call);
+ } while (ret == -EAGAIN);
+
+ if (RIP_REL_REF(sev_cfg).ghcbs_initialized)
+ __sev_put_ghcb(&state);
+
+ native_local_irq_restore(flags);
+
+ return ret;
+}
+
void noinstr __sev_es_nmi_complete(void)
{
struct ghcb_state state;
@@ -648,7 +752,7 @@ static u64 __init get_secrets_page(void)
static u64 __init get_snp_jump_table_addr(void)
{
- struct snp_secrets_page_layout *layout;
+ struct snp_secrets_page *secrets;
void __iomem *mem;
u64 pa, addr;
@@ -662,9 +766,9 @@ static u64 __init get_snp_jump_table_addr(void)
return 0;
}
- layout = (__force struct snp_secrets_page_layout *)mem;
+ secrets = (__force struct snp_secrets_page *)mem;
- addr = layout->os_area.ap_jump_table_pa;
+ addr = secrets->os_area.ap_jump_table_pa;
iounmap(mem);
return addr;
@@ -709,7 +813,6 @@ early_set_pages_state(unsigned long vaddr, unsigned long paddr,
{
unsigned long paddr_end;
u64 val;
- int ret;
vaddr = vaddr & PAGE_MASK;
@@ -717,12 +820,9 @@ early_set_pages_state(unsigned long vaddr, unsigned long paddr,
paddr_end = paddr + (npages << PAGE_SHIFT);
while (paddr < paddr_end) {
- if (op == SNP_PAGE_STATE_SHARED) {
- /* Page validation must be rescinded before changing to shared */
- ret = pvalidate(vaddr, RMP_PG_SIZE_4K, false);
- if (WARN(ret, "Failed to validate address 0x%lx ret %d", paddr, ret))
- goto e_term;
- }
+ /* Page validation must be rescinded before changing to shared */
+ if (op == SNP_PAGE_STATE_SHARED)
+ pvalidate_4k_page(vaddr, paddr, false);
/*
* Use the MSR protocol because this function can be called before
@@ -744,12 +844,9 @@ early_set_pages_state(unsigned long vaddr, unsigned long paddr,
paddr, GHCB_MSR_PSC_RESP_VAL(val)))
goto e_term;
- if (op == SNP_PAGE_STATE_PRIVATE) {
- /* Page validation must be performed after changing to private */
- ret = pvalidate(vaddr, RMP_PG_SIZE_4K, true);
- if (WARN(ret, "Failed to validate address 0x%lx ret %d", paddr, ret))
- goto e_term;
- }
+ /* Page validation must be performed after changing to private */
+ if (op == SNP_PAGE_STATE_PRIVATE)
+ pvalidate_4k_page(vaddr, paddr, true);
vaddr += PAGE_SIZE;
paddr += PAGE_SIZE;
@@ -913,22 +1010,49 @@ void snp_accept_memory(phys_addr_t start, phys_addr_t end)
set_pages_state(vaddr, npages, SNP_PAGE_STATE_PRIVATE);
}
-static int snp_set_vmsa(void *va, bool vmsa)
+static int snp_set_vmsa(void *va, void *caa, int apic_id, bool make_vmsa)
{
- u64 attrs;
+ int ret;
- /*
- * Running at VMPL0 allows the kernel to change the VMSA bit for a page
- * using the RMPADJUST instruction. However, for the instruction to
- * succeed it must target the permissions of a lesser privileged
- * (higher numbered) VMPL level, so use VMPL1 (refer to the RMPADJUST
- * instruction in the AMD64 APM Volume 3).
- */
- attrs = 1;
- if (vmsa)
- attrs |= RMPADJUST_VMSA_PAGE_BIT;
+ if (snp_vmpl) {
+ struct svsm_call call = {};
+ unsigned long flags;
+
+ local_irq_save(flags);
- return rmpadjust((unsigned long)va, RMP_PG_SIZE_4K, attrs);
+ call.caa = this_cpu_read(svsm_caa);
+ call.rcx = __pa(va);
+
+ if (make_vmsa) {
+ /* Protocol 0, Call ID 2 */
+ call.rax = SVSM_CORE_CALL(SVSM_CORE_CREATE_VCPU);
+ call.rdx = __pa(caa);
+ call.r8 = apic_id;
+ } else {
+ /* Protocol 0, Call ID 3 */
+ call.rax = SVSM_CORE_CALL(SVSM_CORE_DELETE_VCPU);
+ }
+
+ ret = svsm_perform_call_protocol(&call);
+
+ local_irq_restore(flags);
+ } else {
+ /*
+ * If the kernel runs at VMPL0, it can change the VMSA
+ * bit for a page using the RMPADJUST instruction.
+ * However, for the instruction to succeed it must
+ * target the permissions of a lesser privileged (higher
+ * numbered) VMPL level, so use VMPL1.
+ */
+ u64 attrs = 1;
+
+ if (make_vmsa)
+ attrs |= RMPADJUST_VMSA_PAGE_BIT;
+
+ ret = rmpadjust((unsigned long)va, RMP_PG_SIZE_4K, attrs);
+ }
+
+ return ret;
}
#define __ATTR_BASE (SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK)
@@ -938,7 +1062,7 @@ static int snp_set_vmsa(void *va, bool vmsa)
#define INIT_LDTR_ATTRIBS (SVM_SELECTOR_P_MASK | 2)
#define INIT_TR_ATTRIBS (SVM_SELECTOR_P_MASK | 3)
-static void *snp_alloc_vmsa_page(void)
+static void *snp_alloc_vmsa_page(int cpu)
{
struct page *p;
@@ -950,7 +1074,7 @@ static void *snp_alloc_vmsa_page(void)
*
* Allocate an 8k page which is also 8k-aligned.
*/
- p = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO, 1);
+ p = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL_ACCOUNT | __GFP_ZERO, 1);
if (!p)
return NULL;
@@ -962,11 +1086,11 @@ static void *snp_alloc_vmsa_page(void)
return page_address(p + 1);
}
-static void snp_cleanup_vmsa(struct sev_es_save_area *vmsa)
+static void snp_cleanup_vmsa(struct sev_es_save_area *vmsa, int apic_id)
{
int err;
- err = snp_set_vmsa(vmsa, false);
+ err = snp_set_vmsa(vmsa, NULL, apic_id, false);
if (err)
pr_err("clear VMSA page failed (%u), leaking page\n", err);
else
@@ -977,6 +1101,7 @@ static int wakeup_cpu_via_vmgexit(u32 apic_id, unsigned long start_ip)
{
struct sev_es_save_area *cur_vmsa, *vmsa;
struct ghcb_state state;
+ struct svsm_ca *caa;
unsigned long flags;
struct ghcb *ghcb;
u8 sipi_vector;
@@ -1019,10 +1144,13 @@ static int wakeup_cpu_via_vmgexit(u32 apic_id, unsigned long start_ip)
* #VMEXIT of that vCPU would wipe out all of the settings being done
* here.
*/
- vmsa = (struct sev_es_save_area *)snp_alloc_vmsa_page();
+ vmsa = (struct sev_es_save_area *)snp_alloc_vmsa_page(cpu);
if (!vmsa)
return -ENOMEM;
+ /* If an SVSM is present, the SVSM per-CPU CAA will be !NULL */
+ caa = per_cpu(svsm_caa, cpu);
+
/* CR4 should maintain the MCE value */
cr4 = native_read_cr4() & X86_CR4_MCE;
@@ -1070,11 +1198,11 @@ static int wakeup_cpu_via_vmgexit(u32 apic_id, unsigned long start_ip)
* VMPL level
* SEV_FEATURES (matches the SEV STATUS MSR right shifted 2 bits)
*/
- vmsa->vmpl = 0;
+ vmsa->vmpl = snp_vmpl;
vmsa->sev_features = sev_status >> 2;
/* Switch the page over to a VMSA page now that it is initialized */
- ret = snp_set_vmsa(vmsa, true);
+ ret = snp_set_vmsa(vmsa, caa, apic_id, true);
if (ret) {
pr_err("set VMSA page failed (%u)\n", ret);
free_page((unsigned long)vmsa);
@@ -1090,7 +1218,10 @@ static int wakeup_cpu_via_vmgexit(u32 apic_id, unsigned long start_ip)
vc_ghcb_invalidate(ghcb);
ghcb_set_rax(ghcb, vmsa->sev_features);
ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_CREATION);
- ghcb_set_sw_exit_info_1(ghcb, ((u64)apic_id << 32) | SVM_VMGEXIT_AP_CREATE);
+ ghcb_set_sw_exit_info_1(ghcb,
+ ((u64)apic_id << 32) |
+ ((u64)snp_vmpl << 16) |
+ SVM_VMGEXIT_AP_CREATE);
ghcb_set_sw_exit_info_2(ghcb, __pa(vmsa));
sev_es_wr_ghcb_msr(__pa(ghcb));
@@ -1108,13 +1239,13 @@ static int wakeup_cpu_via_vmgexit(u32 apic_id, unsigned long start_ip)
/* Perform cleanup if there was an error */
if (ret) {
- snp_cleanup_vmsa(vmsa);
+ snp_cleanup_vmsa(vmsa, apic_id);
vmsa = NULL;
}
/* Free up any previous VMSA page */
if (cur_vmsa)
- snp_cleanup_vmsa(cur_vmsa);
+ snp_cleanup_vmsa(cur_vmsa, apic_id);
/* Record the current VMSA page */
per_cpu(sev_vmsa, cpu) = vmsa;
@@ -1209,6 +1340,17 @@ static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
/* Is it a WRMSR? */
exit_info_1 = (ctxt->insn.opcode.bytes[1] == 0x30) ? 1 : 0;
+ if (regs->cx == MSR_SVSM_CAA) {
+ /* Writes to the SVSM CAA msr are ignored */
+ if (exit_info_1)
+ return ES_OK;
+
+ regs->ax = lower_32_bits(this_cpu_read(svsm_caa_pa));
+ regs->dx = upper_32_bits(this_cpu_read(svsm_caa_pa));
+
+ return ES_OK;
+ }
+
ghcb_set_rcx(ghcb, regs->cx);
if (exit_info_1) {
ghcb_set_rax(ghcb, regs->ax);
@@ -1341,11 +1483,23 @@ static void __init alloc_runtime_data(int cpu)
{
struct sev_es_runtime_data *data;
- data = memblock_alloc(sizeof(*data), PAGE_SIZE);
+ data = memblock_alloc_node(sizeof(*data), PAGE_SIZE, cpu_to_node(cpu));
if (!data)
panic("Can't allocate SEV-ES runtime data");
per_cpu(runtime_data, cpu) = data;
+
+ if (snp_vmpl) {
+ struct svsm_ca *caa;
+
+ /* Allocate the SVSM CA page if an SVSM is present */
+ caa = memblock_alloc(sizeof(*caa), PAGE_SIZE);
+ if (!caa)
+ panic("Can't allocate SVSM CA page\n");
+
+ per_cpu(svsm_caa, cpu) = caa;
+ per_cpu(svsm_caa_pa, cpu) = __pa(caa);
+ }
}
static void __init init_ghcb(int cpu)
@@ -1395,6 +1549,32 @@ void __init sev_es_init_vc_handling(void)
init_ghcb(cpu);
}
+ /* If running under an SVSM, switch to the per-cpu CA */
+ if (snp_vmpl) {
+ struct svsm_call call = {};
+ unsigned long flags;
+ int ret;
+
+ local_irq_save(flags);
+
+ /*
+ * SVSM_CORE_REMAP_CA call:
+ * RAX = 0 (Protocol=0, CallID=0)
+ * RCX = New CA GPA
+ */
+ call.caa = svsm_get_caa();
+ call.rax = SVSM_CORE_CALL(SVSM_CORE_REMAP_CA);
+ call.rcx = this_cpu_read(svsm_caa_pa);
+ ret = svsm_perform_call_protocol(&call);
+ if (ret)
+ panic("Can't remap the SVSM CA, ret=%d, rax_out=0x%llx\n",
+ ret, call.rax_out);
+
+ sev_cfg.use_cas = true;
+
+ local_irq_restore(flags);
+ }
+
sev_es_setup_play_dead();
/* Secondary CPUs use the runtime #VC handler */
@@ -1819,33 +1999,6 @@ static enum es_result vc_handle_exitcode(struct es_em_ctxt *ctxt,
return result;
}
-static __always_inline void vc_forward_exception(struct es_em_ctxt *ctxt)
-{
- long error_code = ctxt->fi.error_code;
- int trapnr = ctxt->fi.vector;
-
- ctxt->regs->orig_ax = ctxt->fi.error_code;
-
- switch (trapnr) {
- case X86_TRAP_GP:
- exc_general_protection(ctxt->regs, error_code);
- break;
- case X86_TRAP_UD:
- exc_invalid_op(ctxt->regs);
- break;
- case X86_TRAP_PF:
- write_cr2(ctxt->fi.cr2);
- exc_page_fault(ctxt->regs, error_code);
- break;
- case X86_TRAP_AC:
- exc_alignment_check(ctxt->regs, error_code);
- break;
- default:
- pr_emerg("Unsupported exception in #VC instruction emulation - can't continue\n");
- BUG();
- }
-}
-
static __always_inline bool is_vc2_stack(unsigned long sp)
{
return (sp >= __this_cpu_ist_bottom_va(VC2) && sp < __this_cpu_ist_top_va(VC2));
@@ -2095,6 +2248,47 @@ found_cc_info:
return cc_info;
}
+static __head void svsm_setup(struct cc_blob_sev_info *cc_info)
+{
+ struct svsm_call call = {};
+ int ret;
+ u64 pa;
+
+ /*
+ * Record the SVSM Calling Area address (CAA) if the guest is not
+ * running at VMPL0. The CA will be used to communicate with the
+ * SVSM to perform the SVSM services.
+ */
+ if (!svsm_setup_ca(cc_info))
+ return;
+
+ /*
+ * It is very early in the boot and the kernel is running identity
+ * mapped but without having adjusted the pagetables to where the
+ * kernel was loaded (physbase), so the get the CA address using
+ * RIP-relative addressing.
+ */
+ pa = (u64)&RIP_REL_REF(boot_svsm_ca_page);
+
+ /*
+ * Switch over to the boot SVSM CA while the current CA is still
+ * addressable. There is no GHCB at this point so use the MSR protocol.
+ *
+ * SVSM_CORE_REMAP_CA call:
+ * RAX = 0 (Protocol=0, CallID=0)
+ * RCX = New CA GPA
+ */
+ call.caa = svsm_get_caa();
+ call.rax = SVSM_CORE_CALL(SVSM_CORE_REMAP_CA);
+ call.rcx = pa;
+ ret = svsm_perform_call_protocol(&call);
+ if (ret)
+ panic("Can't remap the SVSM CA, ret=%d, rax_out=0x%llx\n", ret, call.rax_out);
+
+ RIP_REL_REF(boot_svsm_caa) = (struct svsm_ca *)pa;
+ RIP_REL_REF(boot_svsm_caa_pa) = pa;
+}
+
bool __head snp_init(struct boot_params *bp)
{
struct cc_blob_sev_info *cc_info;
@@ -2108,6 +2302,8 @@ bool __head snp_init(struct boot_params *bp)
setup_cpuid_table(cc_info);
+ svsm_setup(cc_info);
+
/*
* The CC blob will be used later to access the secrets page. Cache
* it here like the boot kernel does.
@@ -2156,23 +2352,27 @@ static void dump_cpuid_table(void)
* expected, but that initialization happens too early in boot to print any
* sort of indicator, and there's not really any other good place to do it,
* so do it here.
+ *
+ * If running as an SNP guest, report the current VM privilege level (VMPL).
*/
-static int __init report_cpuid_table(void)
+static int __init report_snp_info(void)
{
const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table();
- if (!cpuid_table->count)
- return 0;
+ if (cpuid_table->count) {
+ pr_info("Using SNP CPUID table, %d entries present.\n",
+ cpuid_table->count);
- pr_info("Using SNP CPUID table, %d entries present.\n",
- cpuid_table->count);
+ if (sev_cfg.debug)
+ dump_cpuid_table();
+ }
- if (sev_cfg.debug)
- dump_cpuid_table();
+ if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
+ pr_info("SNP running at VMPL%u.\n", snp_vmpl);
return 0;
}
-arch_initcall(report_cpuid_table);
+arch_initcall(report_snp_info);
static int __init init_sev_config(char *str)
{
@@ -2191,6 +2391,56 @@ static int __init init_sev_config(char *str)
}
__setup("sev=", init_sev_config);
+static void update_attest_input(struct svsm_call *call, struct svsm_attest_call *input)
+{
+ /* If (new) lengths have been returned, propagate them up */
+ if (call->rcx_out != call->rcx)
+ input->manifest_buf.len = call->rcx_out;
+
+ if (call->rdx_out != call->rdx)
+ input->certificates_buf.len = call->rdx_out;
+
+ if (call->r8_out != call->r8)
+ input->report_buf.len = call->r8_out;
+}
+
+int snp_issue_svsm_attest_req(u64 call_id, struct svsm_call *call,
+ struct svsm_attest_call *input)
+{
+ struct svsm_attest_call *ac;
+ unsigned long flags;
+ u64 attest_call_pa;
+ int ret;
+
+ if (!snp_vmpl)
+ return -EINVAL;
+
+ local_irq_save(flags);
+
+ call->caa = svsm_get_caa();
+
+ ac = (struct svsm_attest_call *)call->caa->svsm_buffer;
+ attest_call_pa = svsm_get_caa_pa() + offsetof(struct svsm_ca, svsm_buffer);
+
+ *ac = *input;
+
+ /*
+ * Set input registers for the request and set RDX and R8 to known
+ * values in order to detect length values being returned in them.
+ */
+ call->rax = call_id;
+ call->rcx = attest_call_pa;
+ call->rdx = -1;
+ call->r8 = -1;
+ ret = svsm_perform_call_protocol(call);
+ update_attest_input(call, input);
+
+ local_irq_restore(flags);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(snp_issue_svsm_attest_req);
+
int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, struct snp_guest_request_ioctl *rio)
{
struct ghcb_state state;
@@ -2299,3 +2549,58 @@ void sev_show_status(void)
}
pr_cont("\n");
}
+
+void __init snp_update_svsm_ca(void)
+{
+ if (!snp_vmpl)
+ return;
+
+ /* Update the CAA to a proper kernel address */
+ boot_svsm_caa = &boot_svsm_ca_page;
+}
+
+#ifdef CONFIG_SYSFS
+static ssize_t vmpl_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%d\n", snp_vmpl);
+}
+
+static struct kobj_attribute vmpl_attr = __ATTR_RO(vmpl);
+
+static struct attribute *vmpl_attrs[] = {
+ &vmpl_attr.attr,
+ NULL
+};
+
+static struct attribute_group sev_attr_group = {
+ .attrs = vmpl_attrs,
+};
+
+static int __init sev_sysfs_init(void)
+{
+ struct kobject *sev_kobj;
+ struct device *dev_root;
+ int ret;
+
+ if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
+ return -ENODEV;
+
+ dev_root = bus_get_dev_root(&cpu_subsys);
+ if (!dev_root)
+ return -ENODEV;
+
+ sev_kobj = kobject_create_and_add("sev", &dev_root->kobj);
+ put_device(dev_root);
+
+ if (!sev_kobj)
+ return -ENOMEM;
+
+ ret = sysfs_create_group(sev_kobj, &sev_attr_group);
+ if (ret)
+ kobject_put(sev_kobj);
+
+ return ret;
+}
+arch_initcall(sev_sysfs_init);
+#endif // CONFIG_SYSFS
diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/coco/sev/shared.c
index b4f8fa0f722c..71de53194089 100644
--- a/arch/x86/kernel/sev-shared.c
+++ b/arch/x86/coco/sev/shared.c
@@ -21,8 +21,30 @@
#define WARN(condition, format...) (!!(condition))
#define sev_printk(fmt, ...)
#define sev_printk_rtl(fmt, ...)
+#undef vc_forward_exception
+#define vc_forward_exception(c) panic("SNP: Hypervisor requested exception\n")
#endif
+/*
+ * SVSM related information:
+ * When running under an SVSM, the VMPL that Linux is executing at must be
+ * non-zero. The VMPL is therefore used to indicate the presence of an SVSM.
+ *
+ * During boot, the page tables are set up as identity mapped and later
+ * changed to use kernel virtual addresses. Maintain separate virtual and
+ * physical addresses for the CAA to allow SVSM functions to be used during
+ * early boot, both with identity mapped virtual addresses and proper kernel
+ * virtual addresses.
+ */
+u8 snp_vmpl __ro_after_init;
+EXPORT_SYMBOL_GPL(snp_vmpl);
+static struct svsm_ca *boot_svsm_caa __ro_after_init;
+static u64 boot_svsm_caa_pa __ro_after_init;
+
+static struct svsm_ca *svsm_get_caa(void);
+static u64 svsm_get_caa_pa(void);
+static int svsm_perform_call_protocol(struct svsm_call *call);
+
/* I/O parameters for CPUID-related helpers */
struct cpuid_leaf {
u32 fn;
@@ -229,6 +251,126 @@ static enum es_result verify_exception_info(struct ghcb *ghcb, struct es_em_ctxt
return ES_VMM_ERROR;
}
+static inline int svsm_process_result_codes(struct svsm_call *call)
+{
+ switch (call->rax_out) {
+ case SVSM_SUCCESS:
+ return 0;
+ case SVSM_ERR_INCOMPLETE:
+ case SVSM_ERR_BUSY:
+ return -EAGAIN;
+ default:
+ return -EINVAL;
+ }
+}
+
+/*
+ * Issue a VMGEXIT to call the SVSM:
+ * - Load the SVSM register state (RAX, RCX, RDX, R8 and R9)
+ * - Set the CA call pending field to 1
+ * - Issue VMGEXIT
+ * - Save the SVSM return register state (RAX, RCX, RDX, R8 and R9)
+ * - Perform atomic exchange of the CA call pending field
+ *
+ * - See the "Secure VM Service Module for SEV-SNP Guests" specification for
+ * details on the calling convention.
+ * - The calling convention loosely follows the Microsoft X64 calling
+ * convention by putting arguments in RCX, RDX, R8 and R9.
+ * - RAX specifies the SVSM protocol/callid as input and the return code
+ * as output.
+ */
+static __always_inline void svsm_issue_call(struct svsm_call *call, u8 *pending)
+{
+ register unsigned long rax asm("rax") = call->rax;
+ register unsigned long rcx asm("rcx") = call->rcx;
+ register unsigned long rdx asm("rdx") = call->rdx;
+ register unsigned long r8 asm("r8") = call->r8;
+ register unsigned long r9 asm("r9") = call->r9;
+
+ call->caa->call_pending = 1;
+
+ asm volatile("rep; vmmcall\n\t"
+ : "+r" (rax), "+r" (rcx), "+r" (rdx), "+r" (r8), "+r" (r9)
+ : : "memory");
+
+ *pending = xchg(&call->caa->call_pending, *pending);
+
+ call->rax_out = rax;
+ call->rcx_out = rcx;
+ call->rdx_out = rdx;
+ call->r8_out = r8;
+ call->r9_out = r9;
+}
+
+static int svsm_perform_msr_protocol(struct svsm_call *call)
+{
+ u8 pending = 0;
+ u64 val, resp;
+
+ /*
+ * When using the MSR protocol, be sure to save and restore
+ * the current MSR value.
+ */
+ val = sev_es_rd_ghcb_msr();
+
+ sev_es_wr_ghcb_msr(GHCB_MSR_VMPL_REQ_LEVEL(0));
+
+ svsm_issue_call(call, &pending);
+
+ resp = sev_es_rd_ghcb_msr();
+
+ sev_es_wr_ghcb_msr(val);
+
+ if (pending)
+ return -EINVAL;
+
+ if (GHCB_RESP_CODE(resp) != GHCB_MSR_VMPL_RESP)
+ return -EINVAL;
+
+ if (GHCB_MSR_VMPL_RESP_VAL(resp))
+ return -EINVAL;
+
+ return svsm_process_result_codes(call);
+}
+
+static int svsm_perform_ghcb_protocol(struct ghcb *ghcb, struct svsm_call *call)
+{
+ struct es_em_ctxt ctxt;
+ u8 pending = 0;
+
+ vc_ghcb_invalidate(ghcb);
+
+ /*
+ * Fill in protocol and format specifiers. This can be called very early
+ * in the boot, so use rip-relative references as needed.
+ */
+ ghcb->protocol_version = RIP_REL_REF(ghcb_version);
+ ghcb->ghcb_usage = GHCB_DEFAULT_USAGE;
+
+ ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_SNP_RUN_VMPL);
+ ghcb_set_sw_exit_info_1(ghcb, 0);
+ ghcb_set_sw_exit_info_2(ghcb, 0);
+
+ sev_es_wr_ghcb_msr(__pa(ghcb));
+
+ svsm_issue_call(call, &pending);
+
+ if (pending)
+ return -EINVAL;
+
+ switch (verify_exception_info(ghcb, &ctxt)) {
+ case ES_OK:
+ break;
+ case ES_EXCEPTION:
+ vc_forward_exception(&ctxt);
+ fallthrough;
+ default:
+ return -EINVAL;
+ }
+
+ return svsm_process_result_codes(call);
+}
+
static enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb,
struct es_em_ctxt *ctxt,
u64 exit_code, u64 exit_info_1,
@@ -1079,38 +1221,268 @@ static void __head setup_cpuid_table(const struct cc_blob_sev_info *cc_info)
}
}
-static void pvalidate_pages(struct snp_psc_desc *desc)
+static inline void __pval_terminate(u64 pfn, bool action, unsigned int page_size,
+ int ret, u64 svsm_ret)
+{
+ WARN(1, "PVALIDATE failure: pfn: 0x%llx, action: %u, size: %u, ret: %d, svsm_ret: 0x%llx\n",
+ pfn, action, page_size, ret, svsm_ret);
+
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE);
+}
+
+static void svsm_pval_terminate(struct svsm_pvalidate_call *pc, int ret, u64 svsm_ret)
+{
+ unsigned int page_size;
+ bool action;
+ u64 pfn;
+
+ pfn = pc->entry[pc->cur_index].pfn;
+ action = pc->entry[pc->cur_index].action;
+ page_size = pc->entry[pc->cur_index].page_size;
+
+ __pval_terminate(pfn, action, page_size, ret, svsm_ret);
+}
+
+static void svsm_pval_4k_page(unsigned long paddr, bool validate)
+{
+ struct svsm_pvalidate_call *pc;
+ struct svsm_call call = {};
+ unsigned long flags;
+ u64 pc_pa;
+ int ret;
+
+ /*
+ * This can be called very early in the boot, use native functions in
+ * order to avoid paravirt issues.
+ */
+ flags = native_local_irq_save();
+
+ call.caa = svsm_get_caa();
+
+ pc = (struct svsm_pvalidate_call *)call.caa->svsm_buffer;
+ pc_pa = svsm_get_caa_pa() + offsetof(struct svsm_ca, svsm_buffer);
+
+ pc->num_entries = 1;
+ pc->cur_index = 0;
+ pc->entry[0].page_size = RMP_PG_SIZE_4K;
+ pc->entry[0].action = validate;
+ pc->entry[0].ignore_cf = 0;
+ pc->entry[0].pfn = paddr >> PAGE_SHIFT;
+
+ /* Protocol 0, Call ID 1 */
+ call.rax = SVSM_CORE_CALL(SVSM_CORE_PVALIDATE);
+ call.rcx = pc_pa;
+
+ ret = svsm_perform_call_protocol(&call);
+ if (ret)
+ svsm_pval_terminate(pc, ret, call.rax_out);
+
+ native_local_irq_restore(flags);
+}
+
+static void pvalidate_4k_page(unsigned long vaddr, unsigned long paddr, bool validate)
+{
+ int ret;
+
+ /*
+ * This can be called very early during boot, so use rIP-relative
+ * references as needed.
+ */
+ if (RIP_REL_REF(snp_vmpl)) {
+ svsm_pval_4k_page(paddr, validate);
+ } else {
+ ret = pvalidate(vaddr, RMP_PG_SIZE_4K, validate);
+ if (ret)
+ __pval_terminate(PHYS_PFN(paddr), validate, RMP_PG_SIZE_4K, ret, 0);
+ }
+}
+
+static void pval_pages(struct snp_psc_desc *desc)
{
struct psc_entry *e;
unsigned long vaddr;
unsigned int size;
unsigned int i;
bool validate;
+ u64 pfn;
int rc;
for (i = 0; i <= desc->hdr.end_entry; i++) {
e = &desc->entries[i];
- vaddr = (unsigned long)pfn_to_kaddr(e->gfn);
+ pfn = e->gfn;
+ vaddr = (unsigned long)pfn_to_kaddr(pfn);
size = e->pagesize ? RMP_PG_SIZE_2M : RMP_PG_SIZE_4K;
validate = e->operation == SNP_PAGE_STATE_PRIVATE;
rc = pvalidate(vaddr, size, validate);
+ if (!rc)
+ continue;
+
if (rc == PVALIDATE_FAIL_SIZEMISMATCH && size == RMP_PG_SIZE_2M) {
unsigned long vaddr_end = vaddr + PMD_SIZE;
- for (; vaddr < vaddr_end; vaddr += PAGE_SIZE) {
+ for (; vaddr < vaddr_end; vaddr += PAGE_SIZE, pfn++) {
rc = pvalidate(vaddr, RMP_PG_SIZE_4K, validate);
if (rc)
- break;
+ __pval_terminate(pfn, validate, RMP_PG_SIZE_4K, rc, 0);
}
+ } else {
+ __pval_terminate(pfn, validate, size, rc, 0);
}
+ }
+}
+
+static u64 svsm_build_ca_from_pfn_range(u64 pfn, u64 pfn_end, bool action,
+ struct svsm_pvalidate_call *pc)
+{
+ struct svsm_pvalidate_entry *pe;
+
+ /* Nothing in the CA yet */
+ pc->num_entries = 0;
+ pc->cur_index = 0;
+
+ pe = &pc->entry[0];
+
+ while (pfn < pfn_end) {
+ pe->page_size = RMP_PG_SIZE_4K;
+ pe->action = action;
+ pe->ignore_cf = 0;
+ pe->pfn = pfn;
+
+ pe++;
+ pfn++;
+
+ pc->num_entries++;
+ if (pc->num_entries == SVSM_PVALIDATE_MAX_COUNT)
+ break;
+ }
+
+ return pfn;
+}
+
+static int svsm_build_ca_from_psc_desc(struct snp_psc_desc *desc, unsigned int desc_entry,
+ struct svsm_pvalidate_call *pc)
+{
+ struct svsm_pvalidate_entry *pe;
+ struct psc_entry *e;
+
+ /* Nothing in the CA yet */
+ pc->num_entries = 0;
+ pc->cur_index = 0;
+
+ pe = &pc->entry[0];
+ e = &desc->entries[desc_entry];
+
+ while (desc_entry <= desc->hdr.end_entry) {
+ pe->page_size = e->pagesize ? RMP_PG_SIZE_2M : RMP_PG_SIZE_4K;
+ pe->action = e->operation == SNP_PAGE_STATE_PRIVATE;
+ pe->ignore_cf = 0;
+ pe->pfn = e->gfn;
+
+ pe++;
+ e++;
+
+ desc_entry++;
+ pc->num_entries++;
+ if (pc->num_entries == SVSM_PVALIDATE_MAX_COUNT)
+ break;
+ }
+
+ return desc_entry;
+}
+
+static void svsm_pval_pages(struct snp_psc_desc *desc)
+{
+ struct svsm_pvalidate_entry pv_4k[VMGEXIT_PSC_MAX_ENTRY];
+ unsigned int i, pv_4k_count = 0;
+ struct svsm_pvalidate_call *pc;
+ struct svsm_call call = {};
+ unsigned long flags;
+ bool action;
+ u64 pc_pa;
+ int ret;
+
+ /*
+ * This can be called very early in the boot, use native functions in
+ * order to avoid paravirt issues.
+ */
+ flags = native_local_irq_save();
+
+ /*
+ * The SVSM calling area (CA) can support processing 510 entries at a
+ * time. Loop through the Page State Change descriptor until the CA is
+ * full or the last entry in the descriptor is reached, at which time
+ * the SVSM is invoked. This repeats until all entries in the descriptor
+ * are processed.
+ */
+ call.caa = svsm_get_caa();
+
+ pc = (struct svsm_pvalidate_call *)call.caa->svsm_buffer;
+ pc_pa = svsm_get_caa_pa() + offsetof(struct svsm_ca, svsm_buffer);
+
+ /* Protocol 0, Call ID 1 */
+ call.rax = SVSM_CORE_CALL(SVSM_CORE_PVALIDATE);
+ call.rcx = pc_pa;
+
+ for (i = 0; i <= desc->hdr.end_entry;) {
+ i = svsm_build_ca_from_psc_desc(desc, i, pc);
+
+ do {
+ ret = svsm_perform_call_protocol(&call);
+ if (!ret)
+ continue;
+
+ /*
+ * Check if the entry failed because of an RMP mismatch (a
+ * PVALIDATE at 2M was requested, but the page is mapped in
+ * the RMP as 4K).
+ */
- if (rc) {
- WARN(1, "Failed to validate address 0x%lx ret %d", vaddr, rc);
- sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE);
+ if (call.rax_out == SVSM_PVALIDATE_FAIL_SIZEMISMATCH &&
+ pc->entry[pc->cur_index].page_size == RMP_PG_SIZE_2M) {
+ /* Save this entry for post-processing at 4K */
+ pv_4k[pv_4k_count++] = pc->entry[pc->cur_index];
+
+ /* Skip to the next one unless at the end of the list */
+ pc->cur_index++;
+ if (pc->cur_index < pc->num_entries)
+ ret = -EAGAIN;
+ else
+ ret = 0;
+ }
+ } while (ret == -EAGAIN);
+
+ if (ret)
+ svsm_pval_terminate(pc, ret, call.rax_out);
+ }
+
+ /* Process any entries that failed to be validated at 2M and validate them at 4K */
+ for (i = 0; i < pv_4k_count; i++) {
+ u64 pfn, pfn_end;
+
+ action = pv_4k[i].action;
+ pfn = pv_4k[i].pfn;
+ pfn_end = pfn + 512;
+
+ while (pfn < pfn_end) {
+ pfn = svsm_build_ca_from_pfn_range(pfn, pfn_end, action, pc);
+
+ ret = svsm_perform_call_protocol(&call);
+ if (ret)
+ svsm_pval_terminate(pc, ret, call.rax_out);
}
}
+
+ native_local_irq_restore(flags);
+}
+
+static void pvalidate_pages(struct snp_psc_desc *desc)
+{
+ if (snp_vmpl)
+ svsm_pval_pages(desc);
+ else
+ pval_pages(desc);
}
static int vmgexit_psc(struct ghcb *ghcb, struct snp_psc_desc *desc)
@@ -1269,3 +1641,77 @@ static enum es_result vc_check_opcode_bytes(struct es_em_ctxt *ctxt,
return ES_UNSUPPORTED;
}
+
+/*
+ * Maintain the GPA of the SVSM Calling Area (CA) in order to utilize the SVSM
+ * services needed when not running in VMPL0.
+ */
+static bool __head svsm_setup_ca(const struct cc_blob_sev_info *cc_info)
+{
+ struct snp_secrets_page *secrets_page;
+ struct snp_cpuid_table *cpuid_table;
+ unsigned int i;
+ u64 caa;
+
+ BUILD_BUG_ON(sizeof(*secrets_page) != PAGE_SIZE);
+
+ /*
+ * Check if running at VMPL0.
+ *
+ * Use RMPADJUST (see the rmpadjust() function for a description of what
+ * the instruction does) to update the VMPL1 permissions of a page. If
+ * the guest is running at VMPL0, this will succeed and implies there is
+ * no SVSM. If the guest is running at any other VMPL, this will fail.
+ * Linux SNP guests only ever run at a single VMPL level so permission mask
+ * changes of a lesser-privileged VMPL are a don't-care.
+ *
+ * Use a rip-relative reference to obtain the proper address, since this
+ * routine is running identity mapped when called, both by the decompressor
+ * code and the early kernel code.
+ */
+ if (!rmpadjust((unsigned long)&RIP_REL_REF(boot_ghcb_page), RMP_PG_SIZE_4K, 1))
+ return false;
+
+ /*
+ * Not running at VMPL0, ensure everything has been properly supplied
+ * for running under an SVSM.
+ */
+ if (!cc_info || !cc_info->secrets_phys || cc_info->secrets_len != PAGE_SIZE)
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SECRETS_PAGE);
+
+ secrets_page = (struct snp_secrets_page *)cc_info->secrets_phys;
+ if (!secrets_page->svsm_size)
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_NO_SVSM);
+
+ if (!secrets_page->svsm_guest_vmpl)
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SVSM_VMPL0);
+
+ RIP_REL_REF(snp_vmpl) = secrets_page->svsm_guest_vmpl;
+
+ caa = secrets_page->svsm_caa;
+
+ /*
+ * An open-coded PAGE_ALIGNED() in order to avoid including
+ * kernel-proper headers into the decompressor.
+ */
+ if (caa & (PAGE_SIZE - 1))
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SVSM_CAA);
+
+ /*
+ * The CA is identity mapped when this routine is called, both by the
+ * decompressor code and the early kernel code.
+ */
+ RIP_REL_REF(boot_svsm_caa) = (struct svsm_ca *)caa;
+ RIP_REL_REF(boot_svsm_caa_pa) = caa;
+
+ /* Advertise the SVSM presence via CPUID. */
+ cpuid_table = (struct snp_cpuid_table *)snp_cpuid_get_table();
+ for (i = 0; i < cpuid_table->count; i++) {
+ struct snp_cpuid_fn *fn = &cpuid_table->fn[i];
+
+ if (fn->eax_in == 0x8000001f)
+ fn->eax |= BIT(28);
+ }
+
+ return true;
+}
diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c
index c1cb90369915..327c45c5013f 100644
--- a/arch/x86/coco/tdx/tdx.c
+++ b/arch/x86/coco/tdx/tdx.c
@@ -7,6 +7,7 @@
#include <linux/cpufeature.h>
#include <linux/export.h>
#include <linux/io.h>
+#include <linux/kexec.h>
#include <asm/coco.h>
#include <asm/tdx.h>
#include <asm/vmx.h>
@@ -14,6 +15,8 @@
#include <asm/insn.h>
#include <asm/insn-eval.h>
#include <asm/pgtable.h>
+#include <asm/set_memory.h>
+#include <asm/traps.h>
/* MMIO direction */
#define EPT_READ 0
@@ -38,6 +41,8 @@
#define TDREPORT_SUBTYPE_0 0
+static atomic_long_t nr_shared;
+
/* Called from __tdx_hypercall() for unrecoverable failure */
noinstr void __noreturn __tdx_hypercall_failed(void)
{
@@ -385,7 +390,6 @@ static bool mmio_read(int size, unsigned long addr, unsigned long *val)
.r12 = size,
.r13 = EPT_READ,
.r14 = addr,
- .r15 = *val,
};
if (__tdx_hypercall(&args))
@@ -430,6 +434,11 @@ static int handle_mmio(struct pt_regs *regs, struct ve_info *ve)
return -EINVAL;
}
+ if (!fault_in_kernel_space(ve->gla)) {
+ WARN_ONCE(1, "Access to userspace address is not supported");
+ return -EINVAL;
+ }
+
/*
* Reject EPT violation #VEs that split pages.
*
@@ -798,28 +807,124 @@ static bool tdx_enc_status_changed(unsigned long vaddr, int numpages, bool enc)
return true;
}
-static bool tdx_enc_status_change_prepare(unsigned long vaddr, int numpages,
- bool enc)
+static int tdx_enc_status_change_prepare(unsigned long vaddr, int numpages,
+ bool enc)
{
/*
* Only handle shared->private conversion here.
* See the comment in tdx_early_init().
*/
- if (enc)
- return tdx_enc_status_changed(vaddr, numpages, enc);
- return true;
+ if (enc && !tdx_enc_status_changed(vaddr, numpages, enc))
+ return -EIO;
+
+ return 0;
}
-static bool tdx_enc_status_change_finish(unsigned long vaddr, int numpages,
+static int tdx_enc_status_change_finish(unsigned long vaddr, int numpages,
bool enc)
{
/*
* Only handle private->shared conversion here.
* See the comment in tdx_early_init().
*/
- if (!enc)
- return tdx_enc_status_changed(vaddr, numpages, enc);
- return true;
+ if (!enc && !tdx_enc_status_changed(vaddr, numpages, enc))
+ return -EIO;
+
+ if (enc)
+ atomic_long_sub(numpages, &nr_shared);
+ else
+ atomic_long_add(numpages, &nr_shared);
+
+ return 0;
+}
+
+/* Stop new private<->shared conversions */
+static void tdx_kexec_begin(void)
+{
+ if (!IS_ENABLED(CONFIG_KEXEC_CORE))
+ return;
+
+ /*
+ * Crash kernel reaches here with interrupts disabled: can't wait for
+ * conversions to finish.
+ *
+ * If race happened, just report and proceed.
+ */
+ if (!set_memory_enc_stop_conversion())
+ pr_warn("Failed to stop shared<->private conversions\n");
+}
+
+/* Walk direct mapping and convert all shared memory back to private */
+static void tdx_kexec_finish(void)
+{
+ unsigned long addr, end;
+ long found = 0, shared;
+
+ if (!IS_ENABLED(CONFIG_KEXEC_CORE))
+ return;
+
+ lockdep_assert_irqs_disabled();
+
+ addr = PAGE_OFFSET;
+ end = PAGE_OFFSET + get_max_mapped();
+
+ while (addr < end) {
+ unsigned long size;
+ unsigned int level;
+ pte_t *pte;
+
+ pte = lookup_address(addr, &level);
+ size = page_level_size(level);
+
+ if (pte && pte_decrypted(*pte)) {
+ int pages = size / PAGE_SIZE;
+
+ /*
+ * Touching memory with shared bit set triggers implicit
+ * conversion to shared.
+ *
+ * Make sure nobody touches the shared range from
+ * now on.
+ */
+ set_pte(pte, __pte(0));
+
+ /*
+ * Memory encryption state persists across kexec.
+ * If tdx_enc_status_changed() fails in the first
+ * kernel, it leaves memory in an unknown state.
+ *
+ * If that memory remains shared, accessing it in the
+ * *next* kernel through a private mapping will result
+ * in an unrecoverable guest shutdown.
+ *
+ * The kdump kernel boot is not impacted as it uses
+ * a pre-reserved memory range that is always private.
+ * However, gathering crash information could lead to
+ * a crash if it accesses unconverted memory through
+ * a private mapping which is possible when accessing
+ * that memory through /proc/vmcore, for example.
+ *
+ * In all cases, print error info in order to leave
+ * enough bread crumbs for debugging.
+ */
+ if (!tdx_enc_status_changed(addr, pages, true)) {
+ pr_err("Failed to unshare range %#lx-%#lx\n",
+ addr, addr + size);
+ }
+
+ found += pages;
+ }
+
+ addr += size;
+ }
+
+ __flush_tlb_all();
+
+ shared = atomic_long_read(&nr_shared);
+ if (shared != found) {
+ pr_err("shared page accounting is off\n");
+ pr_err("nr_shared = %ld, nr_found = %ld\n", shared, found);
+ }
}
void __init tdx_early_init(void)
@@ -881,6 +986,9 @@ void __init tdx_early_init(void)
x86_platform.guest.enc_cache_flush_required = tdx_cache_flush_required;
x86_platform.guest.enc_tlb_flush_required = tdx_tlb_flush_required;
+ x86_platform.guest.enc_kexec_begin = tdx_kexec_begin;
+ x86_platform.guest.enc_kexec_finish = tdx_kexec_finish;
+
/*
* TDX intercepts the RDMSR to read the X2APIC ID in the parallel
* bringup low level code. That raises #VE which cannot be handled
diff --git a/arch/x86/configs/hardening.config b/arch/x86/configs/hardening.config
index 7b497f3b7bc3..de319852a1e9 100644
--- a/arch/x86/configs/hardening.config
+++ b/arch/x86/configs/hardening.config
@@ -10,5 +10,8 @@ CONFIG_INTEL_IOMMU_DEFAULT_ON=y
CONFIG_INTEL_IOMMU_SVM=y
CONFIG_AMD_IOMMU=y
+# Enforce CET Indirect Branch Tracking in the kernel.
+CONFIG_X86_KERNEL_IBT=y
+
# Enable CET Shadow Stack for userspace.
CONFIG_X86_USER_SHADOW_STACK=y
diff --git a/arch/x86/configs/tiny.config b/arch/x86/configs/tiny.config
index be3ee4294903..aabafa3faa6d 100644
--- a/arch/x86/configs/tiny.config
+++ b/arch/x86/configs/tiny.config
@@ -1,6 +1,2 @@
CONFIG_NOHIGHMEM=y
-# CONFIG_HIGHMEM4G is not set
-# CONFIG_HIGHMEM64G is not set
-# CONFIG_UNWINDER_ORC is not set
CONFIG_UNWINDER_GUESS=y
-# CONFIG_UNWINDER_FRAME_POINTER is not set
diff --git a/arch/x86/crypto/Kconfig b/arch/x86/crypto/Kconfig
index c9e59589a1ce..7b1bebed879d 100644
--- a/arch/x86/crypto/Kconfig
+++ b/arch/x86/crypto/Kconfig
@@ -14,20 +14,25 @@ config CRYPTO_CURVE25519_X86
- ADX (large integer arithmetic)
config CRYPTO_AES_NI_INTEL
- tristate "Ciphers: AES, modes: ECB, CBC, CTS, CTR, XTR, XTS, GCM (AES-NI)"
+ tristate "Ciphers: AES, modes: ECB, CBC, CTS, CTR, XCTR, XTS, GCM (AES-NI/VAES)"
depends on X86
select CRYPTO_AEAD
select CRYPTO_LIB_AES
+ select CRYPTO_LIB_GF128MUL
select CRYPTO_ALGAPI
select CRYPTO_SKCIPHER
select CRYPTO_SIMD
help
Block cipher: AES cipher algorithms
AEAD cipher: AES with GCM
- Length-preserving ciphers: AES with ECB, CBC, CTS, CTR, XTR, XTS
+ Length-preserving ciphers: AES with ECB, CBC, CTS, CTR, XCTR, XTS
Architecture: x86 (32-bit and 64-bit) using:
- AES-NI (AES new instructions)
+ - VAES (Vector AES)
+
+ Some algorithm implementations are supported only in 64-bit builds,
+ and some have additional prerequisites such as AVX2 or AVX512.
config CRYPTO_BLOWFISH_X86_64
tristate "Ciphers: Blowfish, modes: ECB, CBC"
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 9aa46093c91b..53b4a277809e 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -48,7 +48,12 @@ chacha-x86_64-$(CONFIG_AS_AVX512) += chacha-avx512vl-x86_64.o
obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o
-aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o aes_ctrby8_avx-x86_64.o
+aesni-intel-$(CONFIG_64BIT) += aes_ctrby8_avx-x86_64.o \
+ aes-gcm-aesni-x86_64.o \
+ aes-xts-avx-x86_64.o
+ifeq ($(CONFIG_AS_VAES)$(CONFIG_AS_VPCLMULQDQ),yy)
+aesni-intel-$(CONFIG_64BIT) += aes-gcm-avx10-x86_64.o
+endif
obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o
sha1-ssse3-y := sha1_avx2_x86_64_asm.o sha1_ssse3_asm.o sha1_ssse3_glue.o
diff --git a/arch/x86/crypto/aes-gcm-aesni-x86_64.S b/arch/x86/crypto/aes-gcm-aesni-x86_64.S
new file mode 100644
index 000000000000..45940e2883a0
--- /dev/null
+++ b/arch/x86/crypto/aes-gcm-aesni-x86_64.S
@@ -0,0 +1,1128 @@
+/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
+//
+// AES-NI optimized AES-GCM for x86_64
+//
+// Copyright 2024 Google LLC
+//
+// Author: Eric Biggers <ebiggers@google.com>
+//
+//------------------------------------------------------------------------------
+//
+// This file is dual-licensed, meaning that you can use it under your choice of
+// either of the following two licenses:
+//
+// Licensed under the Apache License 2.0 (the "License"). You may obtain a copy
+// of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// or
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+//
+// This file implements AES-GCM (Galois/Counter Mode) for x86_64 CPUs that
+// support the original set of AES instructions, i.e. AES-NI. Two
+// implementations are provided, one that uses AVX and one that doesn't. They
+// are very similar, being generated by the same macros. The only difference is
+// that the AVX implementation takes advantage of VEX-coded instructions in some
+// places to avoid some 'movdqu' and 'movdqa' instructions. The AVX
+// implementation does *not* use 256-bit vectors, as AES is not supported on
+// 256-bit vectors until the VAES feature (which this file doesn't target).
+//
+// The specific CPU feature prerequisites are AES-NI and PCLMULQDQ, plus SSE4.1
+// for the *_aesni functions or AVX for the *_aesni_avx ones. (But it seems
+// there are no CPUs that support AES-NI without also PCLMULQDQ and SSE4.1.)
+//
+// The design generally follows that of aes-gcm-avx10-x86_64.S, and that file is
+// more thoroughly commented. This file has the following notable changes:
+//
+// - The vector length is fixed at 128-bit, i.e. xmm registers. This means
+// there is only one AES block (and GHASH block) per register.
+//
+// - Without AVX512 / AVX10, only 16 SIMD registers are available instead of
+// 32. We work around this by being much more careful about using
+// registers, relying heavily on loads to load values as they are needed.
+//
+// - Masking is not available either. We work around this by implementing
+// partial block loads and stores using overlapping scalar loads and stores
+// combined with shifts and SSE4.1 insertion and extraction instructions.
+//
+// - The main loop is organized differently due to the different design
+// constraints. First, with just one AES block per SIMD register, on some
+// CPUs 4 registers don't saturate the 'aesenc' throughput. We therefore
+// do an 8-register wide loop. Considering that and the fact that we have
+// just 16 SIMD registers to work with, it's not feasible to cache AES
+// round keys and GHASH key powers in registers across loop iterations.
+// That's not ideal, but also not actually that bad, since loads can run in
+// parallel with other instructions. Significantly, this also makes it
+// possible to roll up the inner loops, relying on hardware loop unrolling
+// instead of software loop unrolling, greatly reducing code size.
+//
+// - We implement the GHASH multiplications in the main loop using Karatsuba
+// multiplication instead of schoolbook multiplication. This saves one
+// pclmulqdq instruction per block, at the cost of one 64-bit load, one
+// pshufd, and 0.25 pxors per block. (This is without the three-argument
+// XOR support that would be provided by AVX512 / AVX10, which would be
+// more beneficial to schoolbook than Karatsuba.)
+//
+// As a rough approximation, we can assume that Karatsuba multiplication is
+// faster than schoolbook multiplication in this context if one pshufd and
+// 0.25 pxors are cheaper than a pclmulqdq. (We assume that the 64-bit
+// load is "free" due to running in parallel with arithmetic instructions.)
+// This is true on AMD CPUs, including all that support pclmulqdq up to at
+// least Zen 3. It's also true on older Intel CPUs: Westmere through
+// Haswell on the Core side, and Silvermont through Goldmont Plus on the
+// low-power side. On some of these CPUs, pclmulqdq is quite slow, and the
+// benefit of Karatsuba should be substantial. On newer Intel CPUs,
+// schoolbook multiplication should be faster, but only marginally.
+//
+// Not all these CPUs were available to be tested. However, benchmarks on
+// available CPUs suggest that this approximation is plausible. Switching
+// to Karatsuba showed negligible change (< 1%) on Intel Broadwell,
+// Skylake, and Cascade Lake, but it improved AMD Zen 1-3 by 6-7%.
+// Considering that and the fact that Karatsuba should be even more
+// beneficial on older Intel CPUs, it seems like the right choice here.
+//
+// An additional 0.25 pclmulqdq per block (2 per 8 blocks) could be
+// saved by using a multiplication-less reduction method. We don't do that
+// because it would require a large number of shift and xor instructions,
+// making it less worthwhile and likely harmful on newer CPUs.
+//
+// It does make sense to sometimes use a different reduction optimization
+// that saves a pclmulqdq, though: precompute the hash key times x^64, and
+// multiply the low half of the data block by the hash key with the extra
+// factor of x^64. This eliminates one step of the reduction. However,
+// this is incompatible with Karatsuba multiplication. Therefore, for
+// multi-block processing we use Karatsuba multiplication with a regular
+// reduction. For single-block processing, we use the x^64 optimization.
+
+#include <linux/linkage.h>
+
+.section .rodata
+.p2align 4
+.Lbswap_mask:
+ .octa 0x000102030405060708090a0b0c0d0e0f
+.Lgfpoly:
+ .quad 0xc200000000000000
+.Lone:
+ .quad 1
+.Lgfpoly_and_internal_carrybit:
+ .octa 0xc2000000000000010000000000000001
+ // Loading 16 bytes from '.Lzeropad_mask + 16 - len' produces a mask of
+ // 'len' 0xff bytes and the rest zeroes.
+.Lzeropad_mask:
+ .octa 0xffffffffffffffffffffffffffffffff
+ .octa 0
+
+// Offsets in struct aes_gcm_key_aesni
+#define OFFSETOF_AESKEYLEN 480
+#define OFFSETOF_H_POWERS 496
+#define OFFSETOF_H_POWERS_XORED 624
+#define OFFSETOF_H_TIMES_X64 688
+
+.text
+
+// Do a vpclmulqdq, or fall back to a movdqa and a pclmulqdq. The fallback
+// assumes that all operands are distinct and that any mem operand is aligned.
+.macro _vpclmulqdq imm, src1, src2, dst
+.if USE_AVX
+ vpclmulqdq \imm, \src1, \src2, \dst
+.else
+ movdqa \src2, \dst
+ pclmulqdq \imm, \src1, \dst
+.endif
+.endm
+
+// Do a vpshufb, or fall back to a movdqa and a pshufb. The fallback assumes
+// that all operands are distinct and that any mem operand is aligned.
+.macro _vpshufb src1, src2, dst
+.if USE_AVX
+ vpshufb \src1, \src2, \dst
+.else
+ movdqa \src2, \dst
+ pshufb \src1, \dst
+.endif
+.endm
+
+// Do a vpand, or fall back to a movdqu and a pand. The fallback assumes that
+// all operands are distinct.
+.macro _vpand src1, src2, dst
+.if USE_AVX
+ vpand \src1, \src2, \dst
+.else
+ movdqu \src1, \dst
+ pand \src2, \dst
+.endif
+.endm
+
+// XOR the unaligned memory operand \mem into the xmm register \reg. \tmp must
+// be a temporary xmm register.
+.macro _xor_mem_to_reg mem, reg, tmp
+.if USE_AVX
+ vpxor \mem, \reg, \reg
+.else
+ movdqu \mem, \tmp
+ pxor \tmp, \reg
+.endif
+.endm
+
+// Test the unaligned memory operand \mem against the xmm register \reg. \tmp
+// must be a temporary xmm register.
+.macro _test_mem mem, reg, tmp
+.if USE_AVX
+ vptest \mem, \reg
+.else
+ movdqu \mem, \tmp
+ ptest \tmp, \reg
+.endif
+.endm
+
+// Load 1 <= %ecx <= 15 bytes from the pointer \src into the xmm register \dst
+// and zeroize any remaining bytes. Clobbers %rax, %rcx, and \tmp{64,32}.
+.macro _load_partial_block src, dst, tmp64, tmp32
+ sub $8, %ecx // LEN - 8
+ jle .Lle8\@
+
+ // Load 9 <= LEN <= 15 bytes.
+ movq (\src), \dst // Load first 8 bytes
+ mov (\src, %rcx), %rax // Load last 8 bytes
+ neg %ecx
+ shl $3, %ecx
+ shr %cl, %rax // Discard overlapping bytes
+ pinsrq $1, %rax, \dst
+ jmp .Ldone\@
+
+.Lle8\@:
+ add $4, %ecx // LEN - 4
+ jl .Llt4\@
+
+ // Load 4 <= LEN <= 8 bytes.
+ mov (\src), %eax // Load first 4 bytes
+ mov (\src, %rcx), \tmp32 // Load last 4 bytes
+ jmp .Lcombine\@
+
+.Llt4\@:
+ // Load 1 <= LEN <= 3 bytes.
+ add $2, %ecx // LEN - 2
+ movzbl (\src), %eax // Load first byte
+ jl .Lmovq\@
+ movzwl (\src, %rcx), \tmp32 // Load last 2 bytes
+.Lcombine\@:
+ shl $3, %ecx
+ shl %cl, \tmp64
+ or \tmp64, %rax // Combine the two parts
+.Lmovq\@:
+ movq %rax, \dst
+.Ldone\@:
+.endm
+
+// Store 1 <= %ecx <= 15 bytes from the xmm register \src to the pointer \dst.
+// Clobbers %rax, %rcx, and %rsi.
+.macro _store_partial_block src, dst
+ sub $8, %ecx // LEN - 8
+ jl .Llt8\@
+
+ // Store 8 <= LEN <= 15 bytes.
+ pextrq $1, \src, %rax
+ mov %ecx, %esi
+ shl $3, %ecx
+ ror %cl, %rax
+ mov %rax, (\dst, %rsi) // Store last LEN - 8 bytes
+ movq \src, (\dst) // Store first 8 bytes
+ jmp .Ldone\@
+
+.Llt8\@:
+ add $4, %ecx // LEN - 4
+ jl .Llt4\@
+
+ // Store 4 <= LEN <= 7 bytes.
+ pextrd $1, \src, %eax
+ mov %ecx, %esi
+ shl $3, %ecx
+ ror %cl, %eax
+ mov %eax, (\dst, %rsi) // Store last LEN - 4 bytes
+ movd \src, (\dst) // Store first 4 bytes
+ jmp .Ldone\@
+
+.Llt4\@:
+ // Store 1 <= LEN <= 3 bytes.
+ pextrb $0, \src, 0(\dst)
+ cmp $-2, %ecx // LEN - 4 == -2, i.e. LEN == 2?
+ jl .Ldone\@
+ pextrb $1, \src, 1(\dst)
+ je .Ldone\@
+ pextrb $2, \src, 2(\dst)
+.Ldone\@:
+.endm
+
+// Do one step of GHASH-multiplying \a by \b and storing the reduced product in
+// \b. To complete all steps, this must be invoked with \i=0 through \i=9.
+// \a_times_x64 must contain \a * x^64 in reduced form, \gfpoly must contain the
+// .Lgfpoly constant, and \t0-\t1 must be temporary registers.
+.macro _ghash_mul_step i, a, a_times_x64, b, gfpoly, t0, t1
+
+ // MI = (a_L * b_H) + ((a*x^64)_L * b_L)
+.if \i == 0
+ _vpclmulqdq $0x01, \a, \b, \t0
+.elseif \i == 1
+ _vpclmulqdq $0x00, \a_times_x64, \b, \t1
+.elseif \i == 2
+ pxor \t1, \t0
+
+ // HI = (a_H * b_H) + ((a*x^64)_H * b_L)
+.elseif \i == 3
+ _vpclmulqdq $0x11, \a, \b, \t1
+.elseif \i == 4
+ pclmulqdq $0x10, \a_times_x64, \b
+.elseif \i == 5
+ pxor \t1, \b
+.elseif \i == 6
+
+ // Fold MI into HI.
+ pshufd $0x4e, \t0, \t1 // Swap halves of MI
+.elseif \i == 7
+ pclmulqdq $0x00, \gfpoly, \t0 // MI_L*(x^63 + x^62 + x^57)
+.elseif \i == 8
+ pxor \t1, \b
+.elseif \i == 9
+ pxor \t0, \b
+.endif
+.endm
+
+// GHASH-multiply \a by \b and store the reduced product in \b.
+// See _ghash_mul_step for details.
+.macro _ghash_mul a, a_times_x64, b, gfpoly, t0, t1
+.irp i, 0,1,2,3,4,5,6,7,8,9
+ _ghash_mul_step \i, \a, \a_times_x64, \b, \gfpoly, \t0, \t1
+.endr
+.endm
+
+// GHASH-multiply \a by \b and add the unreduced product to \lo, \mi, and \hi.
+// This does Karatsuba multiplication and must be paired with _ghash_reduce. On
+// the first call, \lo, \mi, and \hi must be zero. \a_xored must contain the
+// two halves of \a XOR'd together, i.e. a_L + a_H. \b is clobbered.
+.macro _ghash_mul_noreduce a, a_xored, b, lo, mi, hi, t0
+
+ // LO += a_L * b_L
+ _vpclmulqdq $0x00, \a, \b, \t0
+ pxor \t0, \lo
+
+ // b_L + b_H
+ pshufd $0x4e, \b, \t0
+ pxor \b, \t0
+
+ // HI += a_H * b_H
+ pclmulqdq $0x11, \a, \b
+ pxor \b, \hi
+
+ // MI += (a_L + a_H) * (b_L + b_H)
+ pclmulqdq $0x00, \a_xored, \t0
+ pxor \t0, \mi
+.endm
+
+// Reduce the product from \lo, \mi, and \hi, and store the result in \dst.
+// This assumes that _ghash_mul_noreduce was used.
+.macro _ghash_reduce lo, mi, hi, dst, t0
+
+ movq .Lgfpoly(%rip), \t0
+
+ // MI += LO + HI (needed because we used Karatsuba multiplication)
+ pxor \lo, \mi
+ pxor \hi, \mi
+
+ // Fold LO into MI.
+ pshufd $0x4e, \lo, \dst
+ pclmulqdq $0x00, \t0, \lo
+ pxor \dst, \mi
+ pxor \lo, \mi
+
+ // Fold MI into HI.
+ pshufd $0x4e, \mi, \dst
+ pclmulqdq $0x00, \t0, \mi
+ pxor \hi, \dst
+ pxor \mi, \dst
+.endm
+
+// Do the first step of the GHASH update of a set of 8 ciphertext blocks.
+//
+// The whole GHASH update does:
+//
+// GHASH_ACC = (blk0+GHASH_ACC)*H^8 + blk1*H^7 + blk2*H^6 + blk3*H^5 +
+// blk4*H^4 + blk5*H^3 + blk6*H^2 + blk7*H^1
+//
+// This macro just does the first step: it does the unreduced multiplication
+// (blk0+GHASH_ACC)*H^8 and starts gathering the unreduced product in the xmm
+// registers LO, MI, and GHASH_ACC a.k.a. HI. It also zero-initializes the
+// inner block counter in %rax, which is a value that counts up by 8 for each
+// block in the set of 8 and is used later to index by 8*blknum and 16*blknum.
+//
+// To reduce the number of pclmulqdq instructions required, both this macro and
+// _ghash_update_continue_8x use Karatsuba multiplication instead of schoolbook
+// multiplication. See the file comment for more details about this choice.
+//
+// Both macros expect the ciphertext blocks blk[0-7] to be available at DST if
+// encrypting, or SRC if decrypting. They also expect the precomputed hash key
+// powers H^i and their XOR'd-together halves to be available in the struct
+// pointed to by KEY. Both macros clobber TMP[0-2].
+.macro _ghash_update_begin_8x enc
+
+ // Initialize the inner block counter.
+ xor %eax, %eax
+
+ // Load the highest hash key power, H^8.
+ movdqa OFFSETOF_H_POWERS(KEY), TMP0
+
+ // Load the first ciphertext block and byte-reflect it.
+.if \enc
+ movdqu (DST), TMP1
+.else
+ movdqu (SRC), TMP1
+.endif
+ pshufb BSWAP_MASK, TMP1
+
+ // Add the GHASH accumulator to the ciphertext block to get the block
+ // 'b' that needs to be multiplied with the hash key power 'a'.
+ pxor TMP1, GHASH_ACC
+
+ // b_L + b_H
+ pshufd $0x4e, GHASH_ACC, MI
+ pxor GHASH_ACC, MI
+
+ // LO = a_L * b_L
+ _vpclmulqdq $0x00, TMP0, GHASH_ACC, LO
+
+ // HI = a_H * b_H
+ pclmulqdq $0x11, TMP0, GHASH_ACC
+
+ // MI = (a_L + a_H) * (b_L + b_H)
+ pclmulqdq $0x00, OFFSETOF_H_POWERS_XORED(KEY), MI
+.endm
+
+// Continue the GHASH update of 8 ciphertext blocks as described above by doing
+// an unreduced multiplication of the next ciphertext block by the next lowest
+// key power and accumulating the result into LO, MI, and GHASH_ACC a.k.a. HI.
+.macro _ghash_update_continue_8x enc
+ add $8, %eax
+
+ // Load the next lowest key power.
+ movdqa OFFSETOF_H_POWERS(KEY,%rax,2), TMP0
+
+ // Load the next ciphertext block and byte-reflect it.
+.if \enc
+ movdqu (DST,%rax,2), TMP1
+.else
+ movdqu (SRC,%rax,2), TMP1
+.endif
+ pshufb BSWAP_MASK, TMP1
+
+ // LO += a_L * b_L
+ _vpclmulqdq $0x00, TMP0, TMP1, TMP2
+ pxor TMP2, LO
+
+ // b_L + b_H
+ pshufd $0x4e, TMP1, TMP2
+ pxor TMP1, TMP2
+
+ // HI += a_H * b_H
+ pclmulqdq $0x11, TMP0, TMP1
+ pxor TMP1, GHASH_ACC
+
+ // MI += (a_L + a_H) * (b_L + b_H)
+ movq OFFSETOF_H_POWERS_XORED(KEY,%rax), TMP1
+ pclmulqdq $0x00, TMP1, TMP2
+ pxor TMP2, MI
+.endm
+
+// Reduce LO, MI, and GHASH_ACC a.k.a. HI into GHASH_ACC. This is similar to
+// _ghash_reduce, but it's hardcoded to use the registers of the main loop and
+// it uses the same register for HI and the destination. It's also divided into
+// two steps. TMP1 must be preserved across steps.
+//
+// One pshufd could be saved by shuffling MI and XOR'ing LO into it, instead of
+// shuffling LO, XOR'ing LO into MI, and shuffling MI. However, this would
+// increase the critical path length, and it seems to slightly hurt performance.
+.macro _ghash_update_end_8x_step i
+.if \i == 0
+ movq .Lgfpoly(%rip), TMP1
+ pxor LO, MI
+ pxor GHASH_ACC, MI
+ pshufd $0x4e, LO, TMP2
+ pclmulqdq $0x00, TMP1, LO
+ pxor TMP2, MI
+ pxor LO, MI
+.elseif \i == 1
+ pshufd $0x4e, MI, TMP2
+ pclmulqdq $0x00, TMP1, MI
+ pxor TMP2, GHASH_ACC
+ pxor MI, GHASH_ACC
+.endif
+.endm
+
+// void aes_gcm_precompute_##suffix(struct aes_gcm_key_aesni *key);
+//
+// Given the expanded AES key, derive the GHASH subkey and initialize the GHASH
+// related fields in the key struct.
+.macro _aes_gcm_precompute
+
+ // Function arguments
+ .set KEY, %rdi
+
+ // Additional local variables.
+ // %xmm0-%xmm1 and %rax are used as temporaries.
+ .set RNDKEYLAST_PTR, %rsi
+ .set H_CUR, %xmm2
+ .set H_POW1, %xmm3 // H^1
+ .set H_POW1_X64, %xmm4 // H^1 * x^64
+ .set GFPOLY, %xmm5
+
+ // Encrypt an all-zeroes block to get the raw hash subkey.
+ movl OFFSETOF_AESKEYLEN(KEY), %eax
+ lea 6*16(KEY,%rax,4), RNDKEYLAST_PTR
+ movdqa (KEY), H_POW1 // Zero-th round key XOR all-zeroes block
+ lea 16(KEY), %rax
+1:
+ aesenc (%rax), H_POW1
+ add $16, %rax
+ cmp %rax, RNDKEYLAST_PTR
+ jne 1b
+ aesenclast (RNDKEYLAST_PTR), H_POW1
+
+ // Preprocess the raw hash subkey as needed to operate on GHASH's
+ // bit-reflected values directly: reflect its bytes, then multiply it by
+ // x^-1 (using the backwards interpretation of polynomial coefficients
+ // from the GCM spec) or equivalently x^1 (using the alternative,
+ // natural interpretation of polynomial coefficients).
+ pshufb .Lbswap_mask(%rip), H_POW1
+ movdqa H_POW1, %xmm0
+ pshufd $0xd3, %xmm0, %xmm0
+ psrad $31, %xmm0
+ paddq H_POW1, H_POW1
+ pand .Lgfpoly_and_internal_carrybit(%rip), %xmm0
+ pxor %xmm0, H_POW1
+
+ // Store H^1.
+ movdqa H_POW1, OFFSETOF_H_POWERS+7*16(KEY)
+
+ // Compute and store H^1 * x^64.
+ movq .Lgfpoly(%rip), GFPOLY
+ pshufd $0x4e, H_POW1, %xmm0
+ _vpclmulqdq $0x00, H_POW1, GFPOLY, H_POW1_X64
+ pxor %xmm0, H_POW1_X64
+ movdqa H_POW1_X64, OFFSETOF_H_TIMES_X64(KEY)
+
+ // Compute and store the halves of H^1 XOR'd together.
+ pxor H_POW1, %xmm0
+ movq %xmm0, OFFSETOF_H_POWERS_XORED+7*8(KEY)
+
+ // Compute and store the remaining key powers H^2 through H^8.
+ movdqa H_POW1, H_CUR
+ mov $6*8, %eax
+.Lprecompute_next\@:
+ // Compute H^i = H^{i-1} * H^1.
+ _ghash_mul H_POW1, H_POW1_X64, H_CUR, GFPOLY, %xmm0, %xmm1
+ // Store H^i.
+ movdqa H_CUR, OFFSETOF_H_POWERS(KEY,%rax,2)
+ // Compute and store the halves of H^i XOR'd together.
+ pshufd $0x4e, H_CUR, %xmm0
+ pxor H_CUR, %xmm0
+ movq %xmm0, OFFSETOF_H_POWERS_XORED(KEY,%rax)
+ sub $8, %eax
+ jge .Lprecompute_next\@
+
+ RET
+.endm
+
+// void aes_gcm_aad_update_aesni(const struct aes_gcm_key_aesni *key,
+// u8 ghash_acc[16], const u8 *aad, int aadlen);
+//
+// This function processes the AAD (Additional Authenticated Data) in GCM.
+// Using the key |key|, it updates the GHASH accumulator |ghash_acc| with the
+// data given by |aad| and |aadlen|. On the first call, |ghash_acc| must be all
+// zeroes. |aadlen| must be a multiple of 16, except on the last call where it
+// can be any length. The caller must do any buffering needed to ensure this.
+.macro _aes_gcm_aad_update
+
+ // Function arguments
+ .set KEY, %rdi
+ .set GHASH_ACC_PTR, %rsi
+ .set AAD, %rdx
+ .set AADLEN, %ecx
+ // Note: _load_partial_block relies on AADLEN being in %ecx.
+
+ // Additional local variables.
+ // %rax, %r10, and %xmm0-%xmm1 are used as temporary registers.
+ .set BSWAP_MASK, %xmm2
+ .set GHASH_ACC, %xmm3
+ .set H_POW1, %xmm4 // H^1
+ .set H_POW1_X64, %xmm5 // H^1 * x^64
+ .set GFPOLY, %xmm6
+
+ movdqa .Lbswap_mask(%rip), BSWAP_MASK
+ movdqu (GHASH_ACC_PTR), GHASH_ACC
+ movdqa OFFSETOF_H_POWERS+7*16(KEY), H_POW1
+ movdqa OFFSETOF_H_TIMES_X64(KEY), H_POW1_X64
+ movq .Lgfpoly(%rip), GFPOLY
+
+ // Process the AAD one full block at a time.
+ sub $16, AADLEN
+ jl .Laad_loop_1x_done\@
+.Laad_loop_1x\@:
+ movdqu (AAD), %xmm0
+ pshufb BSWAP_MASK, %xmm0
+ pxor %xmm0, GHASH_ACC
+ _ghash_mul H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm0, %xmm1
+ add $16, AAD
+ sub $16, AADLEN
+ jge .Laad_loop_1x\@
+.Laad_loop_1x_done\@:
+ // Check whether there is a partial block at the end.
+ add $16, AADLEN
+ jz .Laad_done\@
+
+ // Process a partial block of length 1 <= AADLEN <= 15.
+ // _load_partial_block assumes that %ecx contains AADLEN.
+ _load_partial_block AAD, %xmm0, %r10, %r10d
+ pshufb BSWAP_MASK, %xmm0
+ pxor %xmm0, GHASH_ACC
+ _ghash_mul H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm0, %xmm1
+
+.Laad_done\@:
+ movdqu GHASH_ACC, (GHASH_ACC_PTR)
+ RET
+.endm
+
+// Increment LE_CTR eight times to generate eight little-endian counter blocks,
+// swap each to big-endian, and store them in AESDATA[0-7]. Also XOR them with
+// the zero-th AES round key. Clobbers TMP0 and TMP1.
+.macro _ctr_begin_8x
+ movq .Lone(%rip), TMP0
+ movdqa (KEY), TMP1 // zero-th round key
+.irp i, 0,1,2,3,4,5,6,7
+ _vpshufb BSWAP_MASK, LE_CTR, AESDATA\i
+ pxor TMP1, AESDATA\i
+ paddd TMP0, LE_CTR
+.endr
+.endm
+
+// Do a non-last round of AES on AESDATA[0-7] using \round_key.
+.macro _aesenc_8x round_key
+.irp i, 0,1,2,3,4,5,6,7
+ aesenc \round_key, AESDATA\i
+.endr
+.endm
+
+// Do the last round of AES on AESDATA[0-7] using \round_key.
+.macro _aesenclast_8x round_key
+.irp i, 0,1,2,3,4,5,6,7
+ aesenclast \round_key, AESDATA\i
+.endr
+.endm
+
+// XOR eight blocks from SRC with the keystream blocks in AESDATA[0-7], and
+// store the result to DST. Clobbers TMP0.
+.macro _xor_data_8x
+.irp i, 0,1,2,3,4,5,6,7
+ _xor_mem_to_reg \i*16(SRC), AESDATA\i, tmp=TMP0
+.endr
+.irp i, 0,1,2,3,4,5,6,7
+ movdqu AESDATA\i, \i*16(DST)
+.endr
+.endm
+
+// void aes_gcm_{enc,dec}_update_##suffix(const struct aes_gcm_key_aesni *key,
+// const u32 le_ctr[4], u8 ghash_acc[16],
+// const u8 *src, u8 *dst, int datalen);
+//
+// This macro generates a GCM encryption or decryption update function with the
+// above prototype (with \enc selecting which one).
+//
+// This function computes the next portion of the CTR keystream, XOR's it with
+// |datalen| bytes from |src|, and writes the resulting encrypted or decrypted
+// data to |dst|. It also updates the GHASH accumulator |ghash_acc| using the
+// next |datalen| ciphertext bytes.
+//
+// |datalen| must be a multiple of 16, except on the last call where it can be
+// any length. The caller must do any buffering needed to ensure this. Both
+// in-place and out-of-place en/decryption are supported.
+//
+// |le_ctr| must give the current counter in little-endian format. For a new
+// message, the low word of the counter must be 2. This function loads the
+// counter from |le_ctr| and increments the loaded counter as needed, but it
+// does *not* store the updated counter back to |le_ctr|. The caller must
+// update |le_ctr| if any more data segments follow. Internally, only the low
+// 32-bit word of the counter is incremented, following the GCM standard.
+.macro _aes_gcm_update enc
+
+ // Function arguments
+ .set KEY, %rdi
+ .set LE_CTR_PTR, %rsi // Note: overlaps with usage as temp reg
+ .set GHASH_ACC_PTR, %rdx
+ .set SRC, %rcx
+ .set DST, %r8
+ .set DATALEN, %r9d
+ .set DATALEN64, %r9 // Zero-extend DATALEN before using!
+ // Note: the code setting up for _load_partial_block assumes that SRC is
+ // in %rcx (and that DATALEN is *not* in %rcx).
+
+ // Additional local variables
+
+ // %rax and %rsi are used as temporary registers. Note: %rsi overlaps
+ // with LE_CTR_PTR, which is used only at the beginning.
+
+ .set AESKEYLEN, %r10d // AES key length in bytes
+ .set AESKEYLEN64, %r10
+ .set RNDKEYLAST_PTR, %r11 // Pointer to last AES round key
+
+ // Put the most frequently used values in %xmm0-%xmm7 to reduce code
+ // size. (%xmm0-%xmm7 take fewer bytes to encode than %xmm8-%xmm15.)
+ .set TMP0, %xmm0
+ .set TMP1, %xmm1
+ .set TMP2, %xmm2
+ .set LO, %xmm3 // Low part of unreduced product
+ .set MI, %xmm4 // Middle part of unreduced product
+ .set GHASH_ACC, %xmm5 // GHASH accumulator; in main loop also
+ // the high part of unreduced product
+ .set BSWAP_MASK, %xmm6 // Shuffle mask for reflecting bytes
+ .set LE_CTR, %xmm7 // Little-endian counter value
+ .set AESDATA0, %xmm8
+ .set AESDATA1, %xmm9
+ .set AESDATA2, %xmm10
+ .set AESDATA3, %xmm11
+ .set AESDATA4, %xmm12
+ .set AESDATA5, %xmm13
+ .set AESDATA6, %xmm14
+ .set AESDATA7, %xmm15
+
+ movdqa .Lbswap_mask(%rip), BSWAP_MASK
+ movdqu (GHASH_ACC_PTR), GHASH_ACC
+ movdqu (LE_CTR_PTR), LE_CTR
+
+ movl OFFSETOF_AESKEYLEN(KEY), AESKEYLEN
+ lea 6*16(KEY,AESKEYLEN64,4), RNDKEYLAST_PTR
+
+ // If there are at least 8*16 bytes of data, then continue into the main
+ // loop, which processes 8*16 bytes of data per iteration.
+ //
+ // The main loop interleaves AES and GHASH to improve performance on
+ // CPUs that can execute these instructions in parallel. When
+ // decrypting, the GHASH input (the ciphertext) is immediately
+ // available. When encrypting, we instead encrypt a set of 8 blocks
+ // first and then GHASH those blocks while encrypting the next set of 8,
+ // repeat that as needed, and finally GHASH the last set of 8 blocks.
+ //
+ // Code size optimization: Prefer adding or subtracting -8*16 over 8*16,
+ // as this makes the immediate fit in a signed byte, saving 3 bytes.
+ add $-8*16, DATALEN
+ jl .Lcrypt_loop_8x_done\@
+.if \enc
+ // Encrypt the first 8 plaintext blocks.
+ _ctr_begin_8x
+ lea 16(KEY), %rsi
+ .p2align 4
+1:
+ movdqa (%rsi), TMP0
+ _aesenc_8x TMP0
+ add $16, %rsi
+ cmp %rsi, RNDKEYLAST_PTR
+ jne 1b
+ movdqa (%rsi), TMP0
+ _aesenclast_8x TMP0
+ _xor_data_8x
+ // Don't increment DST until the ciphertext blocks have been hashed.
+ sub $-8*16, SRC
+ add $-8*16, DATALEN
+ jl .Lghash_last_ciphertext_8x\@
+.endif
+
+ .p2align 4
+.Lcrypt_loop_8x\@:
+
+ // Generate the next set of 8 counter blocks and start encrypting them.
+ _ctr_begin_8x
+ lea 16(KEY), %rsi
+
+ // Do a round of AES, and start the GHASH update of 8 ciphertext blocks
+ // by doing the unreduced multiplication for the first ciphertext block.
+ movdqa (%rsi), TMP0
+ add $16, %rsi
+ _aesenc_8x TMP0
+ _ghash_update_begin_8x \enc
+
+ // Do 7 more rounds of AES, and continue the GHASH update by doing the
+ // unreduced multiplication for the remaining ciphertext blocks.
+ .p2align 4
+1:
+ movdqa (%rsi), TMP0
+ add $16, %rsi
+ _aesenc_8x TMP0
+ _ghash_update_continue_8x \enc
+ cmp $7*8, %eax
+ jne 1b
+
+ // Do the remaining AES rounds.
+ .p2align 4
+1:
+ movdqa (%rsi), TMP0
+ add $16, %rsi
+ _aesenc_8x TMP0
+ cmp %rsi, RNDKEYLAST_PTR
+ jne 1b
+
+ // Do the GHASH reduction and the last round of AES.
+ movdqa (RNDKEYLAST_PTR), TMP0
+ _ghash_update_end_8x_step 0
+ _aesenclast_8x TMP0
+ _ghash_update_end_8x_step 1
+
+ // XOR the data with the AES-CTR keystream blocks.
+.if \enc
+ sub $-8*16, DST
+.endif
+ _xor_data_8x
+ sub $-8*16, SRC
+.if !\enc
+ sub $-8*16, DST
+.endif
+ add $-8*16, DATALEN
+ jge .Lcrypt_loop_8x\@
+
+.if \enc
+.Lghash_last_ciphertext_8x\@:
+ // Update GHASH with the last set of 8 ciphertext blocks.
+ _ghash_update_begin_8x \enc
+ .p2align 4
+1:
+ _ghash_update_continue_8x \enc
+ cmp $7*8, %eax
+ jne 1b
+ _ghash_update_end_8x_step 0
+ _ghash_update_end_8x_step 1
+ sub $-8*16, DST
+.endif
+
+.Lcrypt_loop_8x_done\@:
+
+ sub $-8*16, DATALEN
+ jz .Ldone\@
+
+ // Handle the remainder of length 1 <= DATALEN < 8*16 bytes. We keep
+ // things simple and keep the code size down by just going one block at
+ // a time, again taking advantage of hardware loop unrolling. Since
+ // there are enough key powers available for all remaining data, we do
+ // the GHASH multiplications unreduced, and only reduce at the very end.
+
+ .set HI, TMP2
+ .set H_POW, AESDATA0
+ .set H_POW_XORED, AESDATA1
+ .set ONE, AESDATA2
+
+ movq .Lone(%rip), ONE
+
+ // Start collecting the unreduced GHASH intermediate value LO, MI, HI.
+ pxor LO, LO
+ pxor MI, MI
+ pxor HI, HI
+
+ // Set up a block counter %rax to contain 8*(8-n), where n is the number
+ // of blocks that remain, counting any partial block. This will be used
+ // to access the key powers H^n through H^1.
+ mov DATALEN, %eax
+ neg %eax
+ and $~15, %eax
+ sar $1, %eax
+ add $64, %eax
+
+ sub $16, DATALEN
+ jl .Lcrypt_loop_1x_done\@
+
+ // Process the data one full block at a time.
+.Lcrypt_loop_1x\@:
+
+ // Encrypt the next counter block.
+ _vpshufb BSWAP_MASK, LE_CTR, TMP0
+ paddd ONE, LE_CTR
+ pxor (KEY), TMP0
+ lea -6*16(RNDKEYLAST_PTR), %rsi // Reduce code size
+ cmp $24, AESKEYLEN
+ jl 128f // AES-128?
+ je 192f // AES-192?
+ // AES-256
+ aesenc -7*16(%rsi), TMP0
+ aesenc -6*16(%rsi), TMP0
+192:
+ aesenc -5*16(%rsi), TMP0
+ aesenc -4*16(%rsi), TMP0
+128:
+.irp i, -3,-2,-1,0,1,2,3,4,5
+ aesenc \i*16(%rsi), TMP0
+.endr
+ aesenclast (RNDKEYLAST_PTR), TMP0
+
+ // Load the next key power H^i.
+ movdqa OFFSETOF_H_POWERS(KEY,%rax,2), H_POW
+ movq OFFSETOF_H_POWERS_XORED(KEY,%rax), H_POW_XORED
+
+ // XOR the keystream block that was just generated in TMP0 with the next
+ // source data block and store the resulting en/decrypted data to DST.
+.if \enc
+ _xor_mem_to_reg (SRC), TMP0, tmp=TMP1
+ movdqu TMP0, (DST)
+.else
+ movdqu (SRC), TMP1
+ pxor TMP1, TMP0
+ movdqu TMP0, (DST)
+.endif
+
+ // Update GHASH with the ciphertext block.
+.if \enc
+ pshufb BSWAP_MASK, TMP0
+ pxor TMP0, GHASH_ACC
+.else
+ pshufb BSWAP_MASK, TMP1
+ pxor TMP1, GHASH_ACC
+.endif
+ _ghash_mul_noreduce H_POW, H_POW_XORED, GHASH_ACC, LO, MI, HI, TMP0
+ pxor GHASH_ACC, GHASH_ACC
+
+ add $8, %eax
+ add $16, SRC
+ add $16, DST
+ sub $16, DATALEN
+ jge .Lcrypt_loop_1x\@
+.Lcrypt_loop_1x_done\@:
+ // Check whether there is a partial block at the end.
+ add $16, DATALEN
+ jz .Lghash_reduce\@
+
+ // Process a partial block of length 1 <= DATALEN <= 15.
+
+ // Encrypt a counter block for the last time.
+ pshufb BSWAP_MASK, LE_CTR
+ pxor (KEY), LE_CTR
+ lea 16(KEY), %rsi
+1:
+ aesenc (%rsi), LE_CTR
+ add $16, %rsi
+ cmp %rsi, RNDKEYLAST_PTR
+ jne 1b
+ aesenclast (RNDKEYLAST_PTR), LE_CTR
+
+ // Load the lowest key power, H^1.
+ movdqa OFFSETOF_H_POWERS(KEY,%rax,2), H_POW
+ movq OFFSETOF_H_POWERS_XORED(KEY,%rax), H_POW_XORED
+
+ // Load and zero-pad 1 <= DATALEN <= 15 bytes of data from SRC. SRC is
+ // in %rcx, but _load_partial_block needs DATALEN in %rcx instead.
+ // RNDKEYLAST_PTR is no longer needed, so reuse it for SRC.
+ mov SRC, RNDKEYLAST_PTR
+ mov DATALEN, %ecx
+ _load_partial_block RNDKEYLAST_PTR, TMP0, %rsi, %esi
+
+ // XOR the keystream block that was just generated in LE_CTR with the
+ // source data block and store the resulting en/decrypted data to DST.
+ pxor TMP0, LE_CTR
+ mov DATALEN, %ecx
+ _store_partial_block LE_CTR, DST
+
+ // If encrypting, zero-pad the final ciphertext block for GHASH. (If
+ // decrypting, this was already done by _load_partial_block.)
+.if \enc
+ lea .Lzeropad_mask+16(%rip), %rax
+ sub DATALEN64, %rax
+ _vpand (%rax), LE_CTR, TMP0
+.endif
+
+ // Update GHASH with the final ciphertext block.
+ pshufb BSWAP_MASK, TMP0
+ pxor TMP0, GHASH_ACC
+ _ghash_mul_noreduce H_POW, H_POW_XORED, GHASH_ACC, LO, MI, HI, TMP0
+
+.Lghash_reduce\@:
+ // Finally, do the GHASH reduction.
+ _ghash_reduce LO, MI, HI, GHASH_ACC, TMP0
+
+.Ldone\@:
+ // Store the updated GHASH accumulator back to memory.
+ movdqu GHASH_ACC, (GHASH_ACC_PTR)
+
+ RET
+.endm
+
+// void aes_gcm_enc_final_##suffix(const struct aes_gcm_key_aesni *key,
+// const u32 le_ctr[4], u8 ghash_acc[16],
+// u64 total_aadlen, u64 total_datalen);
+// bool aes_gcm_dec_final_##suffix(const struct aes_gcm_key_aesni *key,
+// const u32 le_ctr[4], const u8 ghash_acc[16],
+// u64 total_aadlen, u64 total_datalen,
+// const u8 tag[16], int taglen);
+//
+// This macro generates one of the above two functions (with \enc selecting
+// which one). Both functions finish computing the GCM authentication tag by
+// updating GHASH with the lengths block and encrypting the GHASH accumulator.
+// |total_aadlen| and |total_datalen| must be the total length of the additional
+// authenticated data and the en/decrypted data in bytes, respectively.
+//
+// The encryption function then stores the full-length (16-byte) computed
+// authentication tag to |ghash_acc|. The decryption function instead loads the
+// expected authentication tag (the one that was transmitted) from the 16-byte
+// buffer |tag|, compares the first 4 <= |taglen| <= 16 bytes of it to the
+// computed tag in constant time, and returns true if and only if they match.
+.macro _aes_gcm_final enc
+
+ // Function arguments
+ .set KEY, %rdi
+ .set LE_CTR_PTR, %rsi
+ .set GHASH_ACC_PTR, %rdx
+ .set TOTAL_AADLEN, %rcx
+ .set TOTAL_DATALEN, %r8
+ .set TAG, %r9
+ .set TAGLEN, %r10d // Originally at 8(%rsp)
+ .set TAGLEN64, %r10
+
+ // Additional local variables.
+ // %rax and %xmm0-%xmm2 are used as temporary registers.
+ .set AESKEYLEN, %r11d
+ .set AESKEYLEN64, %r11
+ .set BSWAP_MASK, %xmm3
+ .set GHASH_ACC, %xmm4
+ .set H_POW1, %xmm5 // H^1
+ .set H_POW1_X64, %xmm6 // H^1 * x^64
+ .set GFPOLY, %xmm7
+
+ movdqa .Lbswap_mask(%rip), BSWAP_MASK
+ movl OFFSETOF_AESKEYLEN(KEY), AESKEYLEN
+
+ // Set up a counter block with 1 in the low 32-bit word. This is the
+ // counter that produces the ciphertext needed to encrypt the auth tag.
+ movdqu (LE_CTR_PTR), %xmm0
+ mov $1, %eax
+ pinsrd $0, %eax, %xmm0
+
+ // Build the lengths block and XOR it into the GHASH accumulator.
+ movq TOTAL_DATALEN, GHASH_ACC
+ pinsrq $1, TOTAL_AADLEN, GHASH_ACC
+ psllq $3, GHASH_ACC // Bytes to bits
+ _xor_mem_to_reg (GHASH_ACC_PTR), GHASH_ACC, %xmm1
+
+ movdqa OFFSETOF_H_POWERS+7*16(KEY), H_POW1
+ movdqa OFFSETOF_H_TIMES_X64(KEY), H_POW1_X64
+ movq .Lgfpoly(%rip), GFPOLY
+
+ // Make %rax point to the 6th from last AES round key. (Using signed
+ // byte offsets -7*16 through 6*16 decreases code size.)
+ lea (KEY,AESKEYLEN64,4), %rax
+
+ // AES-encrypt the counter block and also multiply GHASH_ACC by H^1.
+ // Interleave the AES and GHASH instructions to improve performance.
+ pshufb BSWAP_MASK, %xmm0
+ pxor (KEY), %xmm0
+ cmp $24, AESKEYLEN
+ jl 128f // AES-128?
+ je 192f // AES-192?
+ // AES-256
+ aesenc -7*16(%rax), %xmm0
+ aesenc -6*16(%rax), %xmm0
+192:
+ aesenc -5*16(%rax), %xmm0
+ aesenc -4*16(%rax), %xmm0
+128:
+.irp i, 0,1,2,3,4,5,6,7,8
+ aesenc (\i-3)*16(%rax), %xmm0
+ _ghash_mul_step \i, H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm1, %xmm2
+.endr
+ aesenclast 6*16(%rax), %xmm0
+ _ghash_mul_step 9, H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm1, %xmm2
+
+ // Undo the byte reflection of the GHASH accumulator.
+ pshufb BSWAP_MASK, GHASH_ACC
+
+ // Encrypt the GHASH accumulator.
+ pxor %xmm0, GHASH_ACC
+
+.if \enc
+ // Return the computed auth tag.
+ movdqu GHASH_ACC, (GHASH_ACC_PTR)
+.else
+ .set ZEROPAD_MASK_PTR, TOTAL_AADLEN // Reusing TOTAL_AADLEN!
+
+ // Verify the auth tag in constant time by XOR'ing the transmitted and
+ // computed auth tags together and using the ptest instruction to check
+ // whether the first TAGLEN bytes of the result are zero.
+ _xor_mem_to_reg (TAG), GHASH_ACC, tmp=%xmm0
+ movl 8(%rsp), TAGLEN
+ lea .Lzeropad_mask+16(%rip), ZEROPAD_MASK_PTR
+ sub TAGLEN64, ZEROPAD_MASK_PTR
+ xor %eax, %eax
+ _test_mem (ZEROPAD_MASK_PTR), GHASH_ACC, tmp=%xmm0
+ sete %al
+.endif
+ RET
+.endm
+
+.set USE_AVX, 0
+SYM_FUNC_START(aes_gcm_precompute_aesni)
+ _aes_gcm_precompute
+SYM_FUNC_END(aes_gcm_precompute_aesni)
+SYM_FUNC_START(aes_gcm_aad_update_aesni)
+ _aes_gcm_aad_update
+SYM_FUNC_END(aes_gcm_aad_update_aesni)
+SYM_FUNC_START(aes_gcm_enc_update_aesni)
+ _aes_gcm_update 1
+SYM_FUNC_END(aes_gcm_enc_update_aesni)
+SYM_FUNC_START(aes_gcm_dec_update_aesni)
+ _aes_gcm_update 0
+SYM_FUNC_END(aes_gcm_dec_update_aesni)
+SYM_FUNC_START(aes_gcm_enc_final_aesni)
+ _aes_gcm_final 1
+SYM_FUNC_END(aes_gcm_enc_final_aesni)
+SYM_FUNC_START(aes_gcm_dec_final_aesni)
+ _aes_gcm_final 0
+SYM_FUNC_END(aes_gcm_dec_final_aesni)
+
+.set USE_AVX, 1
+SYM_FUNC_START(aes_gcm_precompute_aesni_avx)
+ _aes_gcm_precompute
+SYM_FUNC_END(aes_gcm_precompute_aesni_avx)
+SYM_FUNC_START(aes_gcm_aad_update_aesni_avx)
+ _aes_gcm_aad_update
+SYM_FUNC_END(aes_gcm_aad_update_aesni_avx)
+SYM_FUNC_START(aes_gcm_enc_update_aesni_avx)
+ _aes_gcm_update 1
+SYM_FUNC_END(aes_gcm_enc_update_aesni_avx)
+SYM_FUNC_START(aes_gcm_dec_update_aesni_avx)
+ _aes_gcm_update 0
+SYM_FUNC_END(aes_gcm_dec_update_aesni_avx)
+SYM_FUNC_START(aes_gcm_enc_final_aesni_avx)
+ _aes_gcm_final 1
+SYM_FUNC_END(aes_gcm_enc_final_aesni_avx)
+SYM_FUNC_START(aes_gcm_dec_final_aesni_avx)
+ _aes_gcm_final 0
+SYM_FUNC_END(aes_gcm_dec_final_aesni_avx)
diff --git a/arch/x86/crypto/aes-gcm-avx10-x86_64.S b/arch/x86/crypto/aes-gcm-avx10-x86_64.S
new file mode 100644
index 000000000000..97e0ee515fc5
--- /dev/null
+++ b/arch/x86/crypto/aes-gcm-avx10-x86_64.S
@@ -0,0 +1,1222 @@
+/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
+//
+// VAES and VPCLMULQDQ optimized AES-GCM for x86_64
+//
+// Copyright 2024 Google LLC
+//
+// Author: Eric Biggers <ebiggers@google.com>
+//
+//------------------------------------------------------------------------------
+//
+// This file is dual-licensed, meaning that you can use it under your choice of
+// either of the following two licenses:
+//
+// Licensed under the Apache License 2.0 (the "License"). You may obtain a copy
+// of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// or
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+//
+// This file implements AES-GCM (Galois/Counter Mode) for x86_64 CPUs that
+// support VAES (vector AES), VPCLMULQDQ (vector carryless multiplication), and
+// either AVX512 or AVX10. Some of the functions, notably the encryption and
+// decryption update functions which are the most performance-critical, are
+// provided in two variants generated from a macro: one using 256-bit vectors
+// (suffix: vaes_avx10_256) and one using 512-bit vectors (vaes_avx10_512). The
+// other, "shared" functions (vaes_avx10) use at most 256-bit vectors.
+//
+// The functions that use 512-bit vectors are intended for CPUs that support
+// 512-bit vectors *and* where using them doesn't cause significant
+// downclocking. They require the following CPU features:
+//
+// VAES && VPCLMULQDQ && BMI2 && ((AVX512BW && AVX512VL) || AVX10/512)
+//
+// The other functions require the following CPU features:
+//
+// VAES && VPCLMULQDQ && BMI2 && ((AVX512BW && AVX512VL) || AVX10/256)
+//
+// All functions use the "System V" ABI. The Windows ABI is not supported.
+//
+// Note that we use "avx10" in the names of the functions as a shorthand to
+// really mean "AVX10 or a certain set of AVX512 features". Due to Intel's
+// introduction of AVX512 and then its replacement by AVX10, there doesn't seem
+// to be a simple way to name things that makes sense on all CPUs.
+//
+// Note that the macros that support both 256-bit and 512-bit vectors could
+// fairly easily be changed to support 128-bit too. However, this would *not*
+// be sufficient to allow the code to run on CPUs without AVX512 or AVX10,
+// because the code heavily uses several features of these extensions other than
+// the vector length: the increase in the number of SIMD registers from 16 to
+// 32, masking support, and new instructions such as vpternlogd (which can do a
+// three-argument XOR). These features are very useful for AES-GCM.
+
+#include <linux/linkage.h>
+
+.section .rodata
+.p2align 6
+
+ // A shuffle mask that reflects the bytes of 16-byte blocks
+.Lbswap_mask:
+ .octa 0x000102030405060708090a0b0c0d0e0f
+
+ // This is the GHASH reducing polynomial without its constant term, i.e.
+ // x^128 + x^7 + x^2 + x, represented using the backwards mapping
+ // between bits and polynomial coefficients.
+ //
+ // Alternatively, it can be interpreted as the naturally-ordered
+ // representation of the polynomial x^127 + x^126 + x^121 + 1, i.e. the
+ // "reversed" GHASH reducing polynomial without its x^128 term.
+.Lgfpoly:
+ .octa 0xc2000000000000000000000000000001
+
+ // Same as above, but with the (1 << 64) bit set.
+.Lgfpoly_and_internal_carrybit:
+ .octa 0xc2000000000000010000000000000001
+
+ // The below constants are used for incrementing the counter blocks.
+ // ctr_pattern points to the four 128-bit values [0, 1, 2, 3].
+ // inc_2blocks and inc_4blocks point to the single 128-bit values 2 and
+ // 4. Note that the same '2' is reused in ctr_pattern and inc_2blocks.
+.Lctr_pattern:
+ .octa 0
+ .octa 1
+.Linc_2blocks:
+ .octa 2
+ .octa 3
+.Linc_4blocks:
+ .octa 4
+
+// Number of powers of the hash key stored in the key struct. The powers are
+// stored from highest (H^NUM_H_POWERS) to lowest (H^1).
+#define NUM_H_POWERS 16
+
+// Offset to AES key length (in bytes) in the key struct
+#define OFFSETOF_AESKEYLEN 480
+
+// Offset to start of hash key powers array in the key struct
+#define OFFSETOF_H_POWERS 512
+
+// Offset to end of hash key powers array in the key struct.
+//
+// This is immediately followed by three zeroized padding blocks, which are
+// included so that partial vectors can be handled more easily. E.g. if VL=64
+// and two blocks remain, we load the 4 values [H^2, H^1, 0, 0]. The most
+// padding blocks needed is 3, which occurs if [H^1, 0, 0, 0] is loaded.
+#define OFFSETOFEND_H_POWERS (OFFSETOF_H_POWERS + (NUM_H_POWERS * 16))
+
+.text
+
+// Set the vector length in bytes. This sets the VL variable and defines
+// register aliases V0-V31 that map to the ymm or zmm registers.
+.macro _set_veclen vl
+ .set VL, \vl
+.irp i, 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \
+ 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
+.if VL == 32
+ .set V\i, %ymm\i
+.elseif VL == 64
+ .set V\i, %zmm\i
+.else
+ .error "Unsupported vector length"
+.endif
+.endr
+.endm
+
+// The _ghash_mul_step macro does one step of GHASH multiplication of the
+// 128-bit lanes of \a by the corresponding 128-bit lanes of \b and storing the
+// reduced products in \dst. \t0, \t1, and \t2 are temporary registers of the
+// same size as \a and \b. To complete all steps, this must invoked with \i=0
+// through \i=9. The division into steps allows users of this macro to
+// optionally interleave the computation with other instructions. Users of this
+// macro must preserve the parameter registers across steps.
+//
+// The multiplications are done in GHASH's representation of the finite field
+// GF(2^128). Elements of GF(2^128) are represented as binary polynomials
+// (i.e. polynomials whose coefficients are bits) modulo a reducing polynomial
+// G. The GCM specification uses G = x^128 + x^7 + x^2 + x + 1. Addition is
+// just XOR, while multiplication is more complex and has two parts: (a) do
+// carryless multiplication of two 128-bit input polynomials to get a 256-bit
+// intermediate product polynomial, and (b) reduce the intermediate product to
+// 128 bits by adding multiples of G that cancel out terms in it. (Adding
+// multiples of G doesn't change which field element the polynomial represents.)
+//
+// Unfortunately, the GCM specification maps bits to/from polynomial
+// coefficients backwards from the natural order. In each byte it specifies the
+// highest bit to be the lowest order polynomial coefficient, *not* the highest!
+// This makes it nontrivial to work with the GHASH polynomials. We could
+// reflect the bits, but x86 doesn't have an instruction that does that.
+//
+// Instead, we operate on the values without bit-reflecting them. This *mostly*
+// just works, since XOR and carryless multiplication are symmetric with respect
+// to bit order, but it has some consequences. First, due to GHASH's byte
+// order, by skipping bit reflection, *byte* reflection becomes necessary to
+// give the polynomial terms a consistent order. E.g., considering an N-bit
+// value interpreted using the G = x^128 + x^7 + x^2 + x + 1 convention, bits 0
+// through N-1 of the byte-reflected value represent the coefficients of x^(N-1)
+// through x^0, whereas bits 0 through N-1 of the non-byte-reflected value
+// represent x^7...x^0, x^15...x^8, ..., x^(N-1)...x^(N-8) which can't be worked
+// with. Fortunately, x86's vpshufb instruction can do byte reflection.
+//
+// Second, forgoing the bit reflection causes an extra multiple of x (still
+// using the G = x^128 + x^7 + x^2 + x + 1 convention) to be introduced by each
+// multiplication. This is because an M-bit by N-bit carryless multiplication
+// really produces a (M+N-1)-bit product, but in practice it's zero-extended to
+// M+N bits. In the G = x^128 + x^7 + x^2 + x + 1 convention, which maps bits
+// to polynomial coefficients backwards, this zero-extension actually changes
+// the product by introducing an extra factor of x. Therefore, users of this
+// macro must ensure that one of the inputs has an extra factor of x^-1, i.e.
+// the multiplicative inverse of x, to cancel out the extra x.
+//
+// Third, the backwards coefficients convention is just confusing to work with,
+// since it makes "low" and "high" in the polynomial math mean the opposite of
+// their normal meaning in computer programming. This can be solved by using an
+// alternative interpretation: the polynomial coefficients are understood to be
+// in the natural order, and the multiplication is actually \a * \b * x^-128 mod
+// x^128 + x^127 + x^126 + x^121 + 1. This doesn't change the inputs, outputs,
+// or the implementation at all; it just changes the mathematical interpretation
+// of what each instruction is doing. Starting from here, we'll use this
+// alternative interpretation, as it's easier to understand the code that way.
+//
+// Moving onto the implementation, the vpclmulqdq instruction does 64 x 64 =>
+// 128-bit carryless multiplication, so we break the 128 x 128 multiplication
+// into parts as follows (the _L and _H suffixes denote low and high 64 bits):
+//
+// LO = a_L * b_L
+// MI = (a_L * b_H) + (a_H * b_L)
+// HI = a_H * b_H
+//
+// The 256-bit product is x^128*HI + x^64*MI + LO. LO, MI, and HI are 128-bit.
+// Note that MI "overlaps" with LO and HI. We don't consolidate MI into LO and
+// HI right away, since the way the reduction works makes that unnecessary.
+//
+// For the reduction, we cancel out the low 128 bits by adding multiples of G =
+// x^128 + x^127 + x^126 + x^121 + 1. This is done by two iterations, each of
+// which cancels out the next lowest 64 bits. Consider a value x^64*A + B,
+// where A and B are 128-bit. Adding B_L*G to that value gives:
+//
+// x^64*A + B + B_L*G
+// = x^64*A + x^64*B_H + B_L + B_L*(x^128 + x^127 + x^126 + x^121 + 1)
+// = x^64*A + x^64*B_H + B_L + x^128*B_L + x^64*B_L*(x^63 + x^62 + x^57) + B_L
+// = x^64*A + x^64*B_H + x^128*B_L + x^64*B_L*(x^63 + x^62 + x^57) + B_L + B_L
+// = x^64*(A + B_H + x^64*B_L + B_L*(x^63 + x^62 + x^57))
+//
+// So: if we sum A, B with its halves swapped, and the low half of B times x^63
+// + x^62 + x^57, we get a 128-bit value C where x^64*C is congruent to the
+// original value x^64*A + B. I.e., the low 64 bits got canceled out.
+//
+// We just need to apply this twice: first to fold LO into MI, and second to
+// fold the updated MI into HI.
+//
+// The needed three-argument XORs are done using the vpternlogd instruction with
+// immediate 0x96, since this is faster than two vpxord instructions.
+//
+// A potential optimization, assuming that b is fixed per-key (if a is fixed
+// per-key it would work the other way around), is to use one iteration of the
+// reduction described above to precompute a value c such that x^64*c = b mod G,
+// and then multiply a_L by c (and implicitly by x^64) instead of by b:
+//
+// MI = (a_L * c_L) + (a_H * b_L)
+// HI = (a_L * c_H) + (a_H * b_H)
+//
+// This would eliminate the LO part of the intermediate product, which would
+// eliminate the need to fold LO into MI. This would save two instructions,
+// including a vpclmulqdq. However, we currently don't use this optimization
+// because it would require twice as many per-key precomputed values.
+//
+// Using Karatsuba multiplication instead of "schoolbook" multiplication
+// similarly would save a vpclmulqdq but does not seem to be worth it.
+.macro _ghash_mul_step i, a, b, dst, gfpoly, t0, t1, t2
+.if \i == 0
+ vpclmulqdq $0x00, \a, \b, \t0 // LO = a_L * b_L
+ vpclmulqdq $0x01, \a, \b, \t1 // MI_0 = a_L * b_H
+.elseif \i == 1
+ vpclmulqdq $0x10, \a, \b, \t2 // MI_1 = a_H * b_L
+.elseif \i == 2
+ vpxord \t2, \t1, \t1 // MI = MI_0 + MI_1
+.elseif \i == 3
+ vpclmulqdq $0x01, \t0, \gfpoly, \t2 // LO_L*(x^63 + x^62 + x^57)
+.elseif \i == 4
+ vpshufd $0x4e, \t0, \t0 // Swap halves of LO
+.elseif \i == 5
+ vpternlogd $0x96, \t2, \t0, \t1 // Fold LO into MI
+.elseif \i == 6
+ vpclmulqdq $0x11, \a, \b, \dst // HI = a_H * b_H
+.elseif \i == 7
+ vpclmulqdq $0x01, \t1, \gfpoly, \t0 // MI_L*(x^63 + x^62 + x^57)
+.elseif \i == 8
+ vpshufd $0x4e, \t1, \t1 // Swap halves of MI
+.elseif \i == 9
+ vpternlogd $0x96, \t0, \t1, \dst // Fold MI into HI
+.endif
+.endm
+
+// GHASH-multiply the 128-bit lanes of \a by the 128-bit lanes of \b and store
+// the reduced products in \dst. See _ghash_mul_step for full explanation.
+.macro _ghash_mul a, b, dst, gfpoly, t0, t1, t2
+.irp i, 0,1,2,3,4,5,6,7,8,9
+ _ghash_mul_step \i, \a, \b, \dst, \gfpoly, \t0, \t1, \t2
+.endr
+.endm
+
+// GHASH-multiply the 128-bit lanes of \a by the 128-bit lanes of \b and add the
+// *unreduced* products to \lo, \mi, and \hi.
+.macro _ghash_mul_noreduce a, b, lo, mi, hi, t0, t1, t2, t3
+ vpclmulqdq $0x00, \a, \b, \t0 // a_L * b_L
+ vpclmulqdq $0x01, \a, \b, \t1 // a_L * b_H
+ vpclmulqdq $0x10, \a, \b, \t2 // a_H * b_L
+ vpclmulqdq $0x11, \a, \b, \t3 // a_H * b_H
+ vpxord \t0, \lo, \lo
+ vpternlogd $0x96, \t2, \t1, \mi
+ vpxord \t3, \hi, \hi
+.endm
+
+// Reduce the unreduced products from \lo, \mi, and \hi and store the 128-bit
+// reduced products in \hi. See _ghash_mul_step for explanation of reduction.
+.macro _ghash_reduce lo, mi, hi, gfpoly, t0
+ vpclmulqdq $0x01, \lo, \gfpoly, \t0
+ vpshufd $0x4e, \lo, \lo
+ vpternlogd $0x96, \t0, \lo, \mi
+ vpclmulqdq $0x01, \mi, \gfpoly, \t0
+ vpshufd $0x4e, \mi, \mi
+ vpternlogd $0x96, \t0, \mi, \hi
+.endm
+
+// void aes_gcm_precompute_##suffix(struct aes_gcm_key_avx10 *key);
+//
+// Given the expanded AES key |key->aes_key|, this function derives the GHASH
+// subkey and initializes |key->ghash_key_powers| with powers of it.
+//
+// The number of key powers initialized is NUM_H_POWERS, and they are stored in
+// the order H^NUM_H_POWERS to H^1. The zeroized padding blocks after the key
+// powers themselves are also initialized.
+//
+// This macro supports both VL=32 and VL=64. _set_veclen must have been invoked
+// with the desired length. In the VL=32 case, the function computes twice as
+// many key powers than are actually used by the VL=32 GCM update functions.
+// This is done to keep the key format the same regardless of vector length.
+.macro _aes_gcm_precompute
+
+ // Function arguments
+ .set KEY, %rdi
+
+ // Additional local variables. V0-V2 and %rax are used as temporaries.
+ .set POWERS_PTR, %rsi
+ .set RNDKEYLAST_PTR, %rdx
+ .set H_CUR, V3
+ .set H_CUR_YMM, %ymm3
+ .set H_CUR_XMM, %xmm3
+ .set H_INC, V4
+ .set H_INC_YMM, %ymm4
+ .set H_INC_XMM, %xmm4
+ .set GFPOLY, V5
+ .set GFPOLY_YMM, %ymm5
+ .set GFPOLY_XMM, %xmm5
+
+ // Get pointer to lowest set of key powers (located at end of array).
+ lea OFFSETOFEND_H_POWERS-VL(KEY), POWERS_PTR
+
+ // Encrypt an all-zeroes block to get the raw hash subkey.
+ movl OFFSETOF_AESKEYLEN(KEY), %eax
+ lea 6*16(KEY,%rax,4), RNDKEYLAST_PTR
+ vmovdqu (KEY), %xmm0 // Zero-th round key XOR all-zeroes block
+ add $16, KEY
+1:
+ vaesenc (KEY), %xmm0, %xmm0
+ add $16, KEY
+ cmp KEY, RNDKEYLAST_PTR
+ jne 1b
+ vaesenclast (RNDKEYLAST_PTR), %xmm0, %xmm0
+
+ // Reflect the bytes of the raw hash subkey.
+ vpshufb .Lbswap_mask(%rip), %xmm0, H_CUR_XMM
+
+ // Zeroize the padding blocks.
+ vpxor %xmm0, %xmm0, %xmm0
+ vmovdqu %ymm0, VL(POWERS_PTR)
+ vmovdqu %xmm0, VL+2*16(POWERS_PTR)
+
+ // Finish preprocessing the first key power, H^1. Since this GHASH
+ // implementation operates directly on values with the backwards bit
+ // order specified by the GCM standard, it's necessary to preprocess the
+ // raw key as follows. First, reflect its bytes. Second, multiply it
+ // by x^-1 mod x^128 + x^7 + x^2 + x + 1 (if using the backwards
+ // interpretation of polynomial coefficients), which can also be
+ // interpreted as multiplication by x mod x^128 + x^127 + x^126 + x^121
+ // + 1 using the alternative, natural interpretation of polynomial
+ // coefficients. For details, see the comment above _ghash_mul_step.
+ //
+ // Either way, for the multiplication the concrete operation performed
+ // is a left shift of the 128-bit value by 1 bit, then an XOR with (0xc2
+ // << 120) | 1 if a 1 bit was carried out. However, there's no 128-bit
+ // wide shift instruction, so instead double each of the two 64-bit
+ // halves and incorporate the internal carry bit into the value XOR'd.
+ vpshufd $0xd3, H_CUR_XMM, %xmm0
+ vpsrad $31, %xmm0, %xmm0
+ vpaddq H_CUR_XMM, H_CUR_XMM, H_CUR_XMM
+ vpand .Lgfpoly_and_internal_carrybit(%rip), %xmm0, %xmm0
+ vpxor %xmm0, H_CUR_XMM, H_CUR_XMM
+
+ // Load the gfpoly constant.
+ vbroadcasti32x4 .Lgfpoly(%rip), GFPOLY
+
+ // Square H^1 to get H^2.
+ //
+ // Note that as with H^1, all higher key powers also need an extra
+ // factor of x^-1 (or x using the natural interpretation). Nothing
+ // special needs to be done to make this happen, though: H^1 * H^1 would
+ // end up with two factors of x^-1, but the multiplication consumes one.
+ // So the product H^2 ends up with the desired one factor of x^-1.
+ _ghash_mul H_CUR_XMM, H_CUR_XMM, H_INC_XMM, GFPOLY_XMM, \
+ %xmm0, %xmm1, %xmm2
+
+ // Create H_CUR_YMM = [H^2, H^1] and H_INC_YMM = [H^2, H^2].
+ vinserti128 $1, H_CUR_XMM, H_INC_YMM, H_CUR_YMM
+ vinserti128 $1, H_INC_XMM, H_INC_YMM, H_INC_YMM
+
+.if VL == 64
+ // Create H_CUR = [H^4, H^3, H^2, H^1] and H_INC = [H^4, H^4, H^4, H^4].
+ _ghash_mul H_INC_YMM, H_CUR_YMM, H_INC_YMM, GFPOLY_YMM, \
+ %ymm0, %ymm1, %ymm2
+ vinserti64x4 $1, H_CUR_YMM, H_INC, H_CUR
+ vshufi64x2 $0, H_INC, H_INC, H_INC
+.endif
+
+ // Store the lowest set of key powers.
+ vmovdqu8 H_CUR, (POWERS_PTR)
+
+ // Compute and store the remaining key powers. With VL=32, repeatedly
+ // multiply [H^(i+1), H^i] by [H^2, H^2] to get [H^(i+3), H^(i+2)].
+ // With VL=64, repeatedly multiply [H^(i+3), H^(i+2), H^(i+1), H^i] by
+ // [H^4, H^4, H^4, H^4] to get [H^(i+7), H^(i+6), H^(i+5), H^(i+4)].
+ mov $(NUM_H_POWERS*16/VL) - 1, %eax
+.Lprecompute_next\@:
+ sub $VL, POWERS_PTR
+ _ghash_mul H_INC, H_CUR, H_CUR, GFPOLY, V0, V1, V2
+ vmovdqu8 H_CUR, (POWERS_PTR)
+ dec %eax
+ jnz .Lprecompute_next\@
+
+ vzeroupper // This is needed after using ymm or zmm registers.
+ RET
+.endm
+
+// XOR together the 128-bit lanes of \src (whose low lane is \src_xmm) and store
+// the result in \dst_xmm. This implicitly zeroizes the other lanes of dst.
+.macro _horizontal_xor src, src_xmm, dst_xmm, t0_xmm, t1_xmm, t2_xmm
+ vextracti32x4 $1, \src, \t0_xmm
+.if VL == 32
+ vpxord \t0_xmm, \src_xmm, \dst_xmm
+.elseif VL == 64
+ vextracti32x4 $2, \src, \t1_xmm
+ vextracti32x4 $3, \src, \t2_xmm
+ vpxord \t0_xmm, \src_xmm, \dst_xmm
+ vpternlogd $0x96, \t1_xmm, \t2_xmm, \dst_xmm
+.else
+ .error "Unsupported vector length"
+.endif
+.endm
+
+// Do one step of the GHASH update of the data blocks given in the vector
+// registers GHASHDATA[0-3]. \i specifies the step to do, 0 through 9. The
+// division into steps allows users of this macro to optionally interleave the
+// computation with other instructions. This macro uses the vector register
+// GHASH_ACC as input/output; GHASHDATA[0-3] as inputs that are clobbered;
+// H_POW[4-1], GFPOLY, and BSWAP_MASK as inputs that aren't clobbered; and
+// GHASHTMP[0-2] as temporaries. This macro handles the byte-reflection of the
+// data blocks. The parameter registers must be preserved across steps.
+//
+// The GHASH update does: GHASH_ACC = H_POW4*(GHASHDATA0 + GHASH_ACC) +
+// H_POW3*GHASHDATA1 + H_POW2*GHASHDATA2 + H_POW1*GHASHDATA3, where the
+// operations are vectorized operations on vectors of 16-byte blocks. E.g.,
+// with VL=32 there are 2 blocks per vector and the vectorized terms correspond
+// to the following non-vectorized terms:
+//
+// H_POW4*(GHASHDATA0 + GHASH_ACC) => H^8*(blk0 + GHASH_ACC_XMM) and H^7*(blk1 + 0)
+// H_POW3*GHASHDATA1 => H^6*blk2 and H^5*blk3
+// H_POW2*GHASHDATA2 => H^4*blk4 and H^3*blk5
+// H_POW1*GHASHDATA3 => H^2*blk6 and H^1*blk7
+//
+// With VL=64, we use 4 blocks/vector, H^16 through H^1, and blk0 through blk15.
+//
+// More concretely, this code does:
+// - Do vectorized "schoolbook" multiplications to compute the intermediate
+// 256-bit product of each block and its corresponding hash key power.
+// There are 4*VL/16 of these intermediate products.
+// - Sum (XOR) the intermediate 256-bit products across vectors. This leaves
+// VL/16 256-bit intermediate values.
+// - Do a vectorized reduction of these 256-bit intermediate values to
+// 128-bits each. This leaves VL/16 128-bit intermediate values.
+// - Sum (XOR) these values and store the 128-bit result in GHASH_ACC_XMM.
+//
+// See _ghash_mul_step for the full explanation of the operations performed for
+// each individual finite field multiplication and reduction.
+.macro _ghash_step_4x i
+.if \i == 0
+ vpshufb BSWAP_MASK, GHASHDATA0, GHASHDATA0
+ vpxord GHASH_ACC, GHASHDATA0, GHASHDATA0
+ vpshufb BSWAP_MASK, GHASHDATA1, GHASHDATA1
+ vpshufb BSWAP_MASK, GHASHDATA2, GHASHDATA2
+.elseif \i == 1
+ vpshufb BSWAP_MASK, GHASHDATA3, GHASHDATA3
+ vpclmulqdq $0x00, H_POW4, GHASHDATA0, GHASH_ACC // LO_0
+ vpclmulqdq $0x00, H_POW3, GHASHDATA1, GHASHTMP0 // LO_1
+ vpclmulqdq $0x00, H_POW2, GHASHDATA2, GHASHTMP1 // LO_2
+.elseif \i == 2
+ vpxord GHASHTMP0, GHASH_ACC, GHASH_ACC // sum(LO_{1,0})
+ vpclmulqdq $0x00, H_POW1, GHASHDATA3, GHASHTMP2 // LO_3
+ vpternlogd $0x96, GHASHTMP2, GHASHTMP1, GHASH_ACC // LO = sum(LO_{3,2,1,0})
+ vpclmulqdq $0x01, H_POW4, GHASHDATA0, GHASHTMP0 // MI_0
+.elseif \i == 3
+ vpclmulqdq $0x01, H_POW3, GHASHDATA1, GHASHTMP1 // MI_1
+ vpclmulqdq $0x01, H_POW2, GHASHDATA2, GHASHTMP2 // MI_2
+ vpternlogd $0x96, GHASHTMP2, GHASHTMP1, GHASHTMP0 // sum(MI_{2,1,0})
+ vpclmulqdq $0x01, H_POW1, GHASHDATA3, GHASHTMP1 // MI_3
+.elseif \i == 4
+ vpclmulqdq $0x10, H_POW4, GHASHDATA0, GHASHTMP2 // MI_4
+ vpternlogd $0x96, GHASHTMP2, GHASHTMP1, GHASHTMP0 // sum(MI_{4,3,2,1,0})
+ vpclmulqdq $0x10, H_POW3, GHASHDATA1, GHASHTMP1 // MI_5
+ vpclmulqdq $0x10, H_POW2, GHASHDATA2, GHASHTMP2 // MI_6
+.elseif \i == 5
+ vpternlogd $0x96, GHASHTMP2, GHASHTMP1, GHASHTMP0 // sum(MI_{6,5,4,3,2,1,0})
+ vpclmulqdq $0x01, GHASH_ACC, GFPOLY, GHASHTMP2 // LO_L*(x^63 + x^62 + x^57)
+ vpclmulqdq $0x10, H_POW1, GHASHDATA3, GHASHTMP1 // MI_7
+ vpxord GHASHTMP1, GHASHTMP0, GHASHTMP0 // MI = sum(MI_{7,6,5,4,3,2,1,0})
+.elseif \i == 6
+ vpshufd $0x4e, GHASH_ACC, GHASH_ACC // Swap halves of LO
+ vpclmulqdq $0x11, H_POW4, GHASHDATA0, GHASHDATA0 // HI_0
+ vpclmulqdq $0x11, H_POW3, GHASHDATA1, GHASHDATA1 // HI_1
+ vpclmulqdq $0x11, H_POW2, GHASHDATA2, GHASHDATA2 // HI_2
+.elseif \i == 7
+ vpternlogd $0x96, GHASHTMP2, GHASH_ACC, GHASHTMP0 // Fold LO into MI
+ vpclmulqdq $0x11, H_POW1, GHASHDATA3, GHASHDATA3 // HI_3
+ vpternlogd $0x96, GHASHDATA2, GHASHDATA1, GHASHDATA0 // sum(HI_{2,1,0})
+ vpclmulqdq $0x01, GHASHTMP0, GFPOLY, GHASHTMP1 // MI_L*(x^63 + x^62 + x^57)
+.elseif \i == 8
+ vpxord GHASHDATA3, GHASHDATA0, GHASH_ACC // HI = sum(HI_{3,2,1,0})
+ vpshufd $0x4e, GHASHTMP0, GHASHTMP0 // Swap halves of MI
+ vpternlogd $0x96, GHASHTMP1, GHASHTMP0, GHASH_ACC // Fold MI into HI
+.elseif \i == 9
+ _horizontal_xor GHASH_ACC, GHASH_ACC_XMM, GHASH_ACC_XMM, \
+ GHASHDATA0_XMM, GHASHDATA1_XMM, GHASHDATA2_XMM
+.endif
+.endm
+
+// Do one non-last round of AES encryption on the counter blocks in V0-V3 using
+// the round key that has been broadcast to all 128-bit lanes of \round_key.
+.macro _vaesenc_4x round_key
+ vaesenc \round_key, V0, V0
+ vaesenc \round_key, V1, V1
+ vaesenc \round_key, V2, V2
+ vaesenc \round_key, V3, V3
+.endm
+
+// Start the AES encryption of four vectors of counter blocks.
+.macro _ctr_begin_4x
+
+ // Increment LE_CTR four times to generate four vectors of little-endian
+ // counter blocks, swap each to big-endian, and store them in V0-V3.
+ vpshufb BSWAP_MASK, LE_CTR, V0
+ vpaddd LE_CTR_INC, LE_CTR, LE_CTR
+ vpshufb BSWAP_MASK, LE_CTR, V1
+ vpaddd LE_CTR_INC, LE_CTR, LE_CTR
+ vpshufb BSWAP_MASK, LE_CTR, V2
+ vpaddd LE_CTR_INC, LE_CTR, LE_CTR
+ vpshufb BSWAP_MASK, LE_CTR, V3
+ vpaddd LE_CTR_INC, LE_CTR, LE_CTR
+
+ // AES "round zero": XOR in the zero-th round key.
+ vpxord RNDKEY0, V0, V0
+ vpxord RNDKEY0, V1, V1
+ vpxord RNDKEY0, V2, V2
+ vpxord RNDKEY0, V3, V3
+.endm
+
+// void aes_gcm_{enc,dec}_update_##suffix(const struct aes_gcm_key_avx10 *key,
+// const u32 le_ctr[4], u8 ghash_acc[16],
+// const u8 *src, u8 *dst, int datalen);
+//
+// This macro generates a GCM encryption or decryption update function with the
+// above prototype (with \enc selecting which one). This macro supports both
+// VL=32 and VL=64. _set_veclen must have been invoked with the desired length.
+//
+// This function computes the next portion of the CTR keystream, XOR's it with
+// |datalen| bytes from |src|, and writes the resulting encrypted or decrypted
+// data to |dst|. It also updates the GHASH accumulator |ghash_acc| using the
+// next |datalen| ciphertext bytes.
+//
+// |datalen| must be a multiple of 16, except on the last call where it can be
+// any length. The caller must do any buffering needed to ensure this. Both
+// in-place and out-of-place en/decryption are supported.
+//
+// |le_ctr| must give the current counter in little-endian format. For a new
+// message, the low word of the counter must be 2. This function loads the
+// counter from |le_ctr| and increments the loaded counter as needed, but it
+// does *not* store the updated counter back to |le_ctr|. The caller must
+// update |le_ctr| if any more data segments follow. Internally, only the low
+// 32-bit word of the counter is incremented, following the GCM standard.
+.macro _aes_gcm_update enc
+
+ // Function arguments
+ .set KEY, %rdi
+ .set LE_CTR_PTR, %rsi
+ .set GHASH_ACC_PTR, %rdx
+ .set SRC, %rcx
+ .set DST, %r8
+ .set DATALEN, %r9d
+ .set DATALEN64, %r9 // Zero-extend DATALEN before using!
+
+ // Additional local variables
+
+ // %rax and %k1 are used as temporary registers. LE_CTR_PTR is also
+ // available as a temporary register after the counter is loaded.
+
+ // AES key length in bytes
+ .set AESKEYLEN, %r10d
+ .set AESKEYLEN64, %r10
+
+ // Pointer to the last AES round key for the chosen AES variant
+ .set RNDKEYLAST_PTR, %r11
+
+ // In the main loop, V0-V3 are used as AES input and output. Elsewhere
+ // they are used as temporary registers.
+
+ // GHASHDATA[0-3] hold the ciphertext blocks and GHASH input data.
+ .set GHASHDATA0, V4
+ .set GHASHDATA0_XMM, %xmm4
+ .set GHASHDATA1, V5
+ .set GHASHDATA1_XMM, %xmm5
+ .set GHASHDATA2, V6
+ .set GHASHDATA2_XMM, %xmm6
+ .set GHASHDATA3, V7
+
+ // BSWAP_MASK is the shuffle mask for byte-reflecting 128-bit values
+ // using vpshufb, copied to all 128-bit lanes.
+ .set BSWAP_MASK, V8
+
+ // RNDKEY temporarily holds the next AES round key.
+ .set RNDKEY, V9
+
+ // GHASH_ACC is the accumulator variable for GHASH. When fully reduced,
+ // only the lowest 128-bit lane can be nonzero. When not fully reduced,
+ // more than one lane may be used, and they need to be XOR'd together.
+ .set GHASH_ACC, V10
+ .set GHASH_ACC_XMM, %xmm10
+
+ // LE_CTR_INC is the vector of 32-bit words that need to be added to a
+ // vector of little-endian counter blocks to advance it forwards.
+ .set LE_CTR_INC, V11
+
+ // LE_CTR contains the next set of little-endian counter blocks.
+ .set LE_CTR, V12
+
+ // RNDKEY0, RNDKEYLAST, and RNDKEY_M[9-5] contain cached AES round keys,
+ // copied to all 128-bit lanes. RNDKEY0 is the zero-th round key,
+ // RNDKEYLAST the last, and RNDKEY_M\i the one \i-th from the last.
+ .set RNDKEY0, V13
+ .set RNDKEYLAST, V14
+ .set RNDKEY_M9, V15
+ .set RNDKEY_M8, V16
+ .set RNDKEY_M7, V17
+ .set RNDKEY_M6, V18
+ .set RNDKEY_M5, V19
+
+ // RNDKEYLAST[0-3] temporarily store the last AES round key XOR'd with
+ // the corresponding block of source data. This is useful because
+ // vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a), and key ^ b can
+ // be computed in parallel with the AES rounds.
+ .set RNDKEYLAST0, V20
+ .set RNDKEYLAST1, V21
+ .set RNDKEYLAST2, V22
+ .set RNDKEYLAST3, V23
+
+ // GHASHTMP[0-2] are temporary variables used by _ghash_step_4x. These
+ // cannot coincide with anything used for AES encryption, since for
+ // performance reasons GHASH and AES encryption are interleaved.
+ .set GHASHTMP0, V24
+ .set GHASHTMP1, V25
+ .set GHASHTMP2, V26
+
+ // H_POW[4-1] contain the powers of the hash key H^(4*VL/16)...H^1. The
+ // descending numbering reflects the order of the key powers.
+ .set H_POW4, V27
+ .set H_POW3, V28
+ .set H_POW2, V29
+ .set H_POW1, V30
+
+ // GFPOLY contains the .Lgfpoly constant, copied to all 128-bit lanes.
+ .set GFPOLY, V31
+
+ // Load some constants.
+ vbroadcasti32x4 .Lbswap_mask(%rip), BSWAP_MASK
+ vbroadcasti32x4 .Lgfpoly(%rip), GFPOLY
+
+ // Load the GHASH accumulator and the starting counter.
+ vmovdqu (GHASH_ACC_PTR), GHASH_ACC_XMM
+ vbroadcasti32x4 (LE_CTR_PTR), LE_CTR
+
+ // Load the AES key length in bytes.
+ movl OFFSETOF_AESKEYLEN(KEY), AESKEYLEN
+
+ // Make RNDKEYLAST_PTR point to the last AES round key. This is the
+ // round key with index 10, 12, or 14 for AES-128, AES-192, or AES-256
+ // respectively. Then load the zero-th and last round keys.
+ lea 6*16(KEY,AESKEYLEN64,4), RNDKEYLAST_PTR
+ vbroadcasti32x4 (KEY), RNDKEY0
+ vbroadcasti32x4 (RNDKEYLAST_PTR), RNDKEYLAST
+
+ // Finish initializing LE_CTR by adding [0, 1, ...] to its low words.
+ vpaddd .Lctr_pattern(%rip), LE_CTR, LE_CTR
+
+ // Initialize LE_CTR_INC to contain VL/16 in all 128-bit lanes.
+.if VL == 32
+ vbroadcasti32x4 .Linc_2blocks(%rip), LE_CTR_INC
+.elseif VL == 64
+ vbroadcasti32x4 .Linc_4blocks(%rip), LE_CTR_INC
+.else
+ .error "Unsupported vector length"
+.endif
+
+ // If there are at least 4*VL bytes of data, then continue into the loop
+ // that processes 4*VL bytes of data at a time. Otherwise skip it.
+ //
+ // Pre-subtracting 4*VL from DATALEN saves an instruction from the main
+ // loop and also ensures that at least one write always occurs to
+ // DATALEN, zero-extending it and allowing DATALEN64 to be used later.
+ sub $4*VL, DATALEN
+ jl .Lcrypt_loop_4x_done\@
+
+ // Load powers of the hash key.
+ vmovdqu8 OFFSETOFEND_H_POWERS-4*VL(KEY), H_POW4
+ vmovdqu8 OFFSETOFEND_H_POWERS-3*VL(KEY), H_POW3
+ vmovdqu8 OFFSETOFEND_H_POWERS-2*VL(KEY), H_POW2
+ vmovdqu8 OFFSETOFEND_H_POWERS-1*VL(KEY), H_POW1
+
+ // Main loop: en/decrypt and hash 4 vectors at a time.
+ //
+ // When possible, interleave the AES encryption of the counter blocks
+ // with the GHASH update of the ciphertext blocks. This improves
+ // performance on many CPUs because the execution ports used by the VAES
+ // instructions often differ from those used by vpclmulqdq and other
+ // instructions used in GHASH. For example, many Intel CPUs dispatch
+ // vaesenc to ports 0 and 1 and vpclmulqdq to port 5.
+ //
+ // The interleaving is easiest to do during decryption, since during
+ // decryption the ciphertext blocks are immediately available. For
+ // encryption, instead encrypt the first set of blocks, then hash those
+ // blocks while encrypting the next set of blocks, repeat that as
+ // needed, and finally hash the last set of blocks.
+
+.if \enc
+ // Encrypt the first 4 vectors of plaintext blocks. Leave the resulting
+ // ciphertext in GHASHDATA[0-3] for GHASH.
+ _ctr_begin_4x
+ lea 16(KEY), %rax
+1:
+ vbroadcasti32x4 (%rax), RNDKEY
+ _vaesenc_4x RNDKEY
+ add $16, %rax
+ cmp %rax, RNDKEYLAST_PTR
+ jne 1b
+ vpxord 0*VL(SRC), RNDKEYLAST, RNDKEYLAST0
+ vpxord 1*VL(SRC), RNDKEYLAST, RNDKEYLAST1
+ vpxord 2*VL(SRC), RNDKEYLAST, RNDKEYLAST2
+ vpxord 3*VL(SRC), RNDKEYLAST, RNDKEYLAST3
+ vaesenclast RNDKEYLAST0, V0, GHASHDATA0
+ vaesenclast RNDKEYLAST1, V1, GHASHDATA1
+ vaesenclast RNDKEYLAST2, V2, GHASHDATA2
+ vaesenclast RNDKEYLAST3, V3, GHASHDATA3
+ vmovdqu8 GHASHDATA0, 0*VL(DST)
+ vmovdqu8 GHASHDATA1, 1*VL(DST)
+ vmovdqu8 GHASHDATA2, 2*VL(DST)
+ vmovdqu8 GHASHDATA3, 3*VL(DST)
+ add $4*VL, SRC
+ add $4*VL, DST
+ sub $4*VL, DATALEN
+ jl .Lghash_last_ciphertext_4x\@
+.endif
+
+ // Cache as many additional AES round keys as possible.
+.irp i, 9,8,7,6,5
+ vbroadcasti32x4 -\i*16(RNDKEYLAST_PTR), RNDKEY_M\i
+.endr
+
+.Lcrypt_loop_4x\@:
+
+ // If decrypting, load more ciphertext blocks into GHASHDATA[0-3]. If
+ // encrypting, GHASHDATA[0-3] already contain the previous ciphertext.
+.if !\enc
+ vmovdqu8 0*VL(SRC), GHASHDATA0
+ vmovdqu8 1*VL(SRC), GHASHDATA1
+ vmovdqu8 2*VL(SRC), GHASHDATA2
+ vmovdqu8 3*VL(SRC), GHASHDATA3
+.endif
+
+ // Start the AES encryption of the counter blocks.
+ _ctr_begin_4x
+ cmp $24, AESKEYLEN
+ jl 128f // AES-128?
+ je 192f // AES-192?
+ // AES-256
+ vbroadcasti32x4 -13*16(RNDKEYLAST_PTR), RNDKEY
+ _vaesenc_4x RNDKEY
+ vbroadcasti32x4 -12*16(RNDKEYLAST_PTR), RNDKEY
+ _vaesenc_4x RNDKEY
+192:
+ vbroadcasti32x4 -11*16(RNDKEYLAST_PTR), RNDKEY
+ _vaesenc_4x RNDKEY
+ vbroadcasti32x4 -10*16(RNDKEYLAST_PTR), RNDKEY
+ _vaesenc_4x RNDKEY
+128:
+
+ // XOR the source data with the last round key, saving the result in
+ // RNDKEYLAST[0-3]. This reduces latency by taking advantage of the
+ // property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a).
+.if \enc
+ vpxord 0*VL(SRC), RNDKEYLAST, RNDKEYLAST0
+ vpxord 1*VL(SRC), RNDKEYLAST, RNDKEYLAST1
+ vpxord 2*VL(SRC), RNDKEYLAST, RNDKEYLAST2
+ vpxord 3*VL(SRC), RNDKEYLAST, RNDKEYLAST3
+.else
+ vpxord GHASHDATA0, RNDKEYLAST, RNDKEYLAST0
+ vpxord GHASHDATA1, RNDKEYLAST, RNDKEYLAST1
+ vpxord GHASHDATA2, RNDKEYLAST, RNDKEYLAST2
+ vpxord GHASHDATA3, RNDKEYLAST, RNDKEYLAST3
+.endif
+
+ // Finish the AES encryption of the counter blocks in V0-V3, interleaved
+ // with the GHASH update of the ciphertext blocks in GHASHDATA[0-3].
+.irp i, 9,8,7,6,5
+ _vaesenc_4x RNDKEY_M\i
+ _ghash_step_4x (9 - \i)
+.endr
+.irp i, 4,3,2,1
+ vbroadcasti32x4 -\i*16(RNDKEYLAST_PTR), RNDKEY
+ _vaesenc_4x RNDKEY
+ _ghash_step_4x (9 - \i)
+.endr
+ _ghash_step_4x 9
+
+ // Do the last AES round. This handles the XOR with the source data
+ // too, as per the optimization described above.
+ vaesenclast RNDKEYLAST0, V0, GHASHDATA0
+ vaesenclast RNDKEYLAST1, V1, GHASHDATA1
+ vaesenclast RNDKEYLAST2, V2, GHASHDATA2
+ vaesenclast RNDKEYLAST3, V3, GHASHDATA3
+
+ // Store the en/decrypted data to DST.
+ vmovdqu8 GHASHDATA0, 0*VL(DST)
+ vmovdqu8 GHASHDATA1, 1*VL(DST)
+ vmovdqu8 GHASHDATA2, 2*VL(DST)
+ vmovdqu8 GHASHDATA3, 3*VL(DST)
+
+ add $4*VL, SRC
+ add $4*VL, DST
+ sub $4*VL, DATALEN
+ jge .Lcrypt_loop_4x\@
+
+.if \enc
+.Lghash_last_ciphertext_4x\@:
+ // Update GHASH with the last set of ciphertext blocks.
+.irp i, 0,1,2,3,4,5,6,7,8,9
+ _ghash_step_4x \i
+.endr
+.endif
+
+.Lcrypt_loop_4x_done\@:
+
+ // Undo the extra subtraction by 4*VL and check whether data remains.
+ add $4*VL, DATALEN
+ jz .Ldone\@
+
+ // The data length isn't a multiple of 4*VL. Process the remaining data
+ // of length 1 <= DATALEN < 4*VL, up to one vector (VL bytes) at a time.
+ // Going one vector at a time may seem inefficient compared to having
+ // separate code paths for each possible number of vectors remaining.
+ // However, using a loop keeps the code size down, and it performs
+ // surprising well; modern CPUs will start executing the next iteration
+ // before the previous one finishes and also predict the number of loop
+ // iterations. For a similar reason, we roll up the AES rounds.
+ //
+ // On the last iteration, the remaining length may be less than VL.
+ // Handle this using masking.
+ //
+ // Since there are enough key powers available for all remaining data,
+ // there is no need to do a GHASH reduction after each iteration.
+ // Instead, multiply each remaining block by its own key power, and only
+ // do a GHASH reduction at the very end.
+
+ // Make POWERS_PTR point to the key powers [H^N, H^(N-1), ...] where N
+ // is the number of blocks that remain.
+ .set POWERS_PTR, LE_CTR_PTR // LE_CTR_PTR is free to be reused.
+ mov DATALEN, %eax
+ neg %rax
+ and $~15, %rax // -round_up(DATALEN, 16)
+ lea OFFSETOFEND_H_POWERS(KEY,%rax), POWERS_PTR
+
+ // Start collecting the unreduced GHASH intermediate value LO, MI, HI.
+ .set LO, GHASHDATA0
+ .set LO_XMM, GHASHDATA0_XMM
+ .set MI, GHASHDATA1
+ .set MI_XMM, GHASHDATA1_XMM
+ .set HI, GHASHDATA2
+ .set HI_XMM, GHASHDATA2_XMM
+ vpxor LO_XMM, LO_XMM, LO_XMM
+ vpxor MI_XMM, MI_XMM, MI_XMM
+ vpxor HI_XMM, HI_XMM, HI_XMM
+
+.Lcrypt_loop_1x\@:
+
+ // Select the appropriate mask for this iteration: all 1's if
+ // DATALEN >= VL, otherwise DATALEN 1's. Do this branchlessly using the
+ // bzhi instruction from BMI2. (This relies on DATALEN <= 255.)
+.if VL < 64
+ mov $-1, %eax
+ bzhi DATALEN, %eax, %eax
+ kmovd %eax, %k1
+.else
+ mov $-1, %rax
+ bzhi DATALEN64, %rax, %rax
+ kmovq %rax, %k1
+.endif
+
+ // Encrypt a vector of counter blocks. This does not need to be masked.
+ vpshufb BSWAP_MASK, LE_CTR, V0
+ vpaddd LE_CTR_INC, LE_CTR, LE_CTR
+ vpxord RNDKEY0, V0, V0
+ lea 16(KEY), %rax
+1:
+ vbroadcasti32x4 (%rax), RNDKEY
+ vaesenc RNDKEY, V0, V0
+ add $16, %rax
+ cmp %rax, RNDKEYLAST_PTR
+ jne 1b
+ vaesenclast RNDKEYLAST, V0, V0
+
+ // XOR the data with the appropriate number of keystream bytes.
+ vmovdqu8 (SRC), V1{%k1}{z}
+ vpxord V1, V0, V0
+ vmovdqu8 V0, (DST){%k1}
+
+ // Update GHASH with the ciphertext block(s), without reducing.
+ //
+ // In the case of DATALEN < VL, the ciphertext is zero-padded to VL.
+ // (If decrypting, it's done by the above masked load. If encrypting,
+ // it's done by the below masked register-to-register move.) Note that
+ // if DATALEN <= VL - 16, there will be additional padding beyond the
+ // padding of the last block specified by GHASH itself; i.e., there may
+ // be whole block(s) that get processed by the GHASH multiplication and
+ // reduction instructions but should not actually be included in the
+ // GHASH. However, any such blocks are all-zeroes, and the values that
+ // they're multiplied with are also all-zeroes. Therefore they just add
+ // 0 * 0 = 0 to the final GHASH result, which makes no difference.
+ vmovdqu8 (POWERS_PTR), H_POW1
+.if \enc
+ vmovdqu8 V0, V1{%k1}{z}
+.endif
+ vpshufb BSWAP_MASK, V1, V0
+ vpxord GHASH_ACC, V0, V0
+ _ghash_mul_noreduce H_POW1, V0, LO, MI, HI, GHASHDATA3, V1, V2, V3
+ vpxor GHASH_ACC_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM
+
+ add $VL, POWERS_PTR
+ add $VL, SRC
+ add $VL, DST
+ sub $VL, DATALEN
+ jg .Lcrypt_loop_1x\@
+
+ // Finally, do the GHASH reduction.
+ _ghash_reduce LO, MI, HI, GFPOLY, V0
+ _horizontal_xor HI, HI_XMM, GHASH_ACC_XMM, %xmm0, %xmm1, %xmm2
+
+.Ldone\@:
+ // Store the updated GHASH accumulator back to memory.
+ vmovdqu GHASH_ACC_XMM, (GHASH_ACC_PTR)
+
+ vzeroupper // This is needed after using ymm or zmm registers.
+ RET
+.endm
+
+// void aes_gcm_enc_final_vaes_avx10(const struct aes_gcm_key_avx10 *key,
+// const u32 le_ctr[4], u8 ghash_acc[16],
+// u64 total_aadlen, u64 total_datalen);
+// bool aes_gcm_dec_final_vaes_avx10(const struct aes_gcm_key_avx10 *key,
+// const u32 le_ctr[4],
+// const u8 ghash_acc[16],
+// u64 total_aadlen, u64 total_datalen,
+// const u8 tag[16], int taglen);
+//
+// This macro generates one of the above two functions (with \enc selecting
+// which one). Both functions finish computing the GCM authentication tag by
+// updating GHASH with the lengths block and encrypting the GHASH accumulator.
+// |total_aadlen| and |total_datalen| must be the total length of the additional
+// authenticated data and the en/decrypted data in bytes, respectively.
+//
+// The encryption function then stores the full-length (16-byte) computed
+// authentication tag to |ghash_acc|. The decryption function instead loads the
+// expected authentication tag (the one that was transmitted) from the 16-byte
+// buffer |tag|, compares the first 4 <= |taglen| <= 16 bytes of it to the
+// computed tag in constant time, and returns true if and only if they match.
+.macro _aes_gcm_final enc
+
+ // Function arguments
+ .set KEY, %rdi
+ .set LE_CTR_PTR, %rsi
+ .set GHASH_ACC_PTR, %rdx
+ .set TOTAL_AADLEN, %rcx
+ .set TOTAL_DATALEN, %r8
+ .set TAG, %r9
+ .set TAGLEN, %r10d // Originally at 8(%rsp)
+
+ // Additional local variables.
+ // %rax, %xmm0-%xmm3, and %k1 are used as temporary registers.
+ .set AESKEYLEN, %r11d
+ .set AESKEYLEN64, %r11
+ .set GFPOLY, %xmm4
+ .set BSWAP_MASK, %xmm5
+ .set LE_CTR, %xmm6
+ .set GHASH_ACC, %xmm7
+ .set H_POW1, %xmm8
+
+ // Load some constants.
+ vmovdqa .Lgfpoly(%rip), GFPOLY
+ vmovdqa .Lbswap_mask(%rip), BSWAP_MASK
+
+ // Load the AES key length in bytes.
+ movl OFFSETOF_AESKEYLEN(KEY), AESKEYLEN
+
+ // Set up a counter block with 1 in the low 32-bit word. This is the
+ // counter that produces the ciphertext needed to encrypt the auth tag.
+ // GFPOLY has 1 in the low word, so grab the 1 from there using a blend.
+ vpblendd $0xe, (LE_CTR_PTR), GFPOLY, LE_CTR
+
+ // Build the lengths block and XOR it with the GHASH accumulator.
+ // Although the lengths block is defined as the AAD length followed by
+ // the en/decrypted data length, both in big-endian byte order, a byte
+ // reflection of the full block is needed because of the way we compute
+ // GHASH (see _ghash_mul_step). By using little-endian values in the
+ // opposite order, we avoid having to reflect any bytes here.
+ vmovq TOTAL_DATALEN, %xmm0
+ vpinsrq $1, TOTAL_AADLEN, %xmm0, %xmm0
+ vpsllq $3, %xmm0, %xmm0 // Bytes to bits
+ vpxor (GHASH_ACC_PTR), %xmm0, GHASH_ACC
+
+ // Load the first hash key power (H^1), which is stored last.
+ vmovdqu8 OFFSETOFEND_H_POWERS-16(KEY), H_POW1
+
+.if !\enc
+ // Prepare a mask of TAGLEN one bits.
+ movl 8(%rsp), TAGLEN
+ mov $-1, %eax
+ bzhi TAGLEN, %eax, %eax
+ kmovd %eax, %k1
+.endif
+
+ // Make %rax point to the last AES round key for the chosen AES variant.
+ lea 6*16(KEY,AESKEYLEN64,4), %rax
+
+ // Start the AES encryption of the counter block by swapping the counter
+ // block to big-endian and XOR-ing it with the zero-th AES round key.
+ vpshufb BSWAP_MASK, LE_CTR, %xmm0
+ vpxor (KEY), %xmm0, %xmm0
+
+ // Complete the AES encryption and multiply GHASH_ACC by H^1.
+ // Interleave the AES and GHASH instructions to improve performance.
+ cmp $24, AESKEYLEN
+ jl 128f // AES-128?
+ je 192f // AES-192?
+ // AES-256
+ vaesenc -13*16(%rax), %xmm0, %xmm0
+ vaesenc -12*16(%rax), %xmm0, %xmm0
+192:
+ vaesenc -11*16(%rax), %xmm0, %xmm0
+ vaesenc -10*16(%rax), %xmm0, %xmm0
+128:
+.irp i, 0,1,2,3,4,5,6,7,8
+ _ghash_mul_step \i, H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \
+ %xmm1, %xmm2, %xmm3
+ vaesenc (\i-9)*16(%rax), %xmm0, %xmm0
+.endr
+ _ghash_mul_step 9, H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \
+ %xmm1, %xmm2, %xmm3
+
+ // Undo the byte reflection of the GHASH accumulator.
+ vpshufb BSWAP_MASK, GHASH_ACC, GHASH_ACC
+
+ // Do the last AES round and XOR the resulting keystream block with the
+ // GHASH accumulator to produce the full computed authentication tag.
+ //
+ // Reduce latency by taking advantage of the property vaesenclast(key,
+ // a) ^ b == vaesenclast(key ^ b, a). I.e., XOR GHASH_ACC into the last
+ // round key, instead of XOR'ing the final AES output with GHASH_ACC.
+ //
+ // enc_final then returns the computed auth tag, while dec_final
+ // compares it with the transmitted one and returns a bool. To compare
+ // the tags, dec_final XORs them together and uses vptest to check
+ // whether the result is all-zeroes. This should be constant-time.
+ // dec_final applies the vaesenclast optimization to this additional
+ // value XOR'd too, using vpternlogd to XOR the last round key, GHASH
+ // accumulator, and transmitted auth tag together in one instruction.
+.if \enc
+ vpxor (%rax), GHASH_ACC, %xmm1
+ vaesenclast %xmm1, %xmm0, GHASH_ACC
+ vmovdqu GHASH_ACC, (GHASH_ACC_PTR)
+.else
+ vmovdqu (TAG), %xmm1
+ vpternlogd $0x96, (%rax), GHASH_ACC, %xmm1
+ vaesenclast %xmm1, %xmm0, %xmm0
+ xor %eax, %eax
+ vmovdqu8 %xmm0, %xmm0{%k1}{z} // Truncate to TAGLEN bytes
+ vptest %xmm0, %xmm0
+ sete %al
+.endif
+ // No need for vzeroupper here, since only used xmm registers were used.
+ RET
+.endm
+
+_set_veclen 32
+SYM_FUNC_START(aes_gcm_precompute_vaes_avx10_256)
+ _aes_gcm_precompute
+SYM_FUNC_END(aes_gcm_precompute_vaes_avx10_256)
+SYM_FUNC_START(aes_gcm_enc_update_vaes_avx10_256)
+ _aes_gcm_update 1
+SYM_FUNC_END(aes_gcm_enc_update_vaes_avx10_256)
+SYM_FUNC_START(aes_gcm_dec_update_vaes_avx10_256)
+ _aes_gcm_update 0
+SYM_FUNC_END(aes_gcm_dec_update_vaes_avx10_256)
+
+_set_veclen 64
+SYM_FUNC_START(aes_gcm_precompute_vaes_avx10_512)
+ _aes_gcm_precompute
+SYM_FUNC_END(aes_gcm_precompute_vaes_avx10_512)
+SYM_FUNC_START(aes_gcm_enc_update_vaes_avx10_512)
+ _aes_gcm_update 1
+SYM_FUNC_END(aes_gcm_enc_update_vaes_avx10_512)
+SYM_FUNC_START(aes_gcm_dec_update_vaes_avx10_512)
+ _aes_gcm_update 0
+SYM_FUNC_END(aes_gcm_dec_update_vaes_avx10_512)
+
+// void aes_gcm_aad_update_vaes_avx10(const struct aes_gcm_key_avx10 *key,
+// u8 ghash_acc[16],
+// const u8 *aad, int aadlen);
+//
+// This function processes the AAD (Additional Authenticated Data) in GCM.
+// Using the key |key|, it updates the GHASH accumulator |ghash_acc| with the
+// data given by |aad| and |aadlen|. |key->ghash_key_powers| must have been
+// initialized. On the first call, |ghash_acc| must be all zeroes. |aadlen|
+// must be a multiple of 16, except on the last call where it can be any length.
+// The caller must do any buffering needed to ensure this.
+//
+// AES-GCM is almost always used with small amounts of AAD, less than 32 bytes.
+// Therefore, for AAD processing we currently only provide this implementation
+// which uses 256-bit vectors (ymm registers) and only has a 1x-wide loop. This
+// keeps the code size down, and it enables some micro-optimizations, e.g. using
+// VEX-coded instructions instead of EVEX-coded to save some instruction bytes.
+// To optimize for large amounts of AAD, we could implement a 4x-wide loop and
+// provide a version using 512-bit vectors, but that doesn't seem to be useful.
+SYM_FUNC_START(aes_gcm_aad_update_vaes_avx10)
+
+ // Function arguments
+ .set KEY, %rdi
+ .set GHASH_ACC_PTR, %rsi
+ .set AAD, %rdx
+ .set AADLEN, %ecx
+ .set AADLEN64, %rcx // Zero-extend AADLEN before using!
+
+ // Additional local variables.
+ // %rax, %ymm0-%ymm3, and %k1 are used as temporary registers.
+ .set BSWAP_MASK, %ymm4
+ .set GFPOLY, %ymm5
+ .set GHASH_ACC, %ymm6
+ .set GHASH_ACC_XMM, %xmm6
+ .set H_POW1, %ymm7
+
+ // Load some constants.
+ vbroadcasti128 .Lbswap_mask(%rip), BSWAP_MASK
+ vbroadcasti128 .Lgfpoly(%rip), GFPOLY
+
+ // Load the GHASH accumulator.
+ vmovdqu (GHASH_ACC_PTR), GHASH_ACC_XMM
+
+ // Update GHASH with 32 bytes of AAD at a time.
+ //
+ // Pre-subtracting 32 from AADLEN saves an instruction from the loop and
+ // also ensures that at least one write always occurs to AADLEN,
+ // zero-extending it and allowing AADLEN64 to be used later.
+ sub $32, AADLEN
+ jl .Laad_loop_1x_done
+ vmovdqu8 OFFSETOFEND_H_POWERS-32(KEY), H_POW1 // [H^2, H^1]
+.Laad_loop_1x:
+ vmovdqu (AAD), %ymm0
+ vpshufb BSWAP_MASK, %ymm0, %ymm0
+ vpxor %ymm0, GHASH_ACC, GHASH_ACC
+ _ghash_mul H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \
+ %ymm0, %ymm1, %ymm2
+ vextracti128 $1, GHASH_ACC, %xmm0
+ vpxor %xmm0, GHASH_ACC_XMM, GHASH_ACC_XMM
+ add $32, AAD
+ sub $32, AADLEN
+ jge .Laad_loop_1x
+.Laad_loop_1x_done:
+ add $32, AADLEN
+ jz .Laad_done
+
+ // Update GHASH with the remaining 1 <= AADLEN < 32 bytes of AAD.
+ mov $-1, %eax
+ bzhi AADLEN, %eax, %eax
+ kmovd %eax, %k1
+ vmovdqu8 (AAD), %ymm0{%k1}{z}
+ neg AADLEN64
+ and $~15, AADLEN64 // -round_up(AADLEN, 16)
+ vmovdqu8 OFFSETOFEND_H_POWERS(KEY,AADLEN64), H_POW1
+ vpshufb BSWAP_MASK, %ymm0, %ymm0
+ vpxor %ymm0, GHASH_ACC, GHASH_ACC
+ _ghash_mul H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \
+ %ymm0, %ymm1, %ymm2
+ vextracti128 $1, GHASH_ACC, %xmm0
+ vpxor %xmm0, GHASH_ACC_XMM, GHASH_ACC_XMM
+
+.Laad_done:
+ // Store the updated GHASH accumulator back to memory.
+ vmovdqu GHASH_ACC_XMM, (GHASH_ACC_PTR)
+
+ vzeroupper // This is needed after using ymm or zmm registers.
+ RET
+SYM_FUNC_END(aes_gcm_aad_update_vaes_avx10)
+
+SYM_FUNC_START(aes_gcm_enc_final_vaes_avx10)
+ _aes_gcm_final 1
+SYM_FUNC_END(aes_gcm_enc_final_vaes_avx10)
+SYM_FUNC_START(aes_gcm_dec_final_vaes_avx10)
+ _aes_gcm_final 0
+SYM_FUNC_END(aes_gcm_dec_final_vaes_avx10)
diff --git a/arch/x86/crypto/aes-xts-avx-x86_64.S b/arch/x86/crypto/aes-xts-avx-x86_64.S
new file mode 100644
index 000000000000..48f97b79f7a9
--- /dev/null
+++ b/arch/x86/crypto/aes-xts-avx-x86_64.S
@@ -0,0 +1,845 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * AES-XTS for modern x86_64 CPUs
+ *
+ * Copyright 2024 Google LLC
+ *
+ * Author: Eric Biggers <ebiggers@google.com>
+ */
+
+/*
+ * This file implements AES-XTS for modern x86_64 CPUs. To handle the
+ * complexities of coding for x86 SIMD, e.g. where every vector length needs
+ * different code, it uses a macro to generate several implementations that
+ * share similar source code but are targeted at different CPUs, listed below:
+ *
+ * AES-NI + AVX
+ * - 128-bit vectors (1 AES block per vector)
+ * - VEX-coded instructions
+ * - xmm0-xmm15
+ * - This is for older CPUs that lack VAES but do have AVX.
+ *
+ * VAES + VPCLMULQDQ + AVX2
+ * - 256-bit vectors (2 AES blocks per vector)
+ * - VEX-coded instructions
+ * - ymm0-ymm15
+ * - This is for CPUs that have VAES but lack AVX512 or AVX10,
+ * e.g. Intel's Alder Lake and AMD's Zen 3.
+ *
+ * VAES + VPCLMULQDQ + AVX10/256 + BMI2
+ * - 256-bit vectors (2 AES blocks per vector)
+ * - EVEX-coded instructions
+ * - ymm0-ymm31
+ * - This is for CPUs that have AVX512 but where using zmm registers causes
+ * downclocking, and for CPUs that have AVX10/256 but not AVX10/512.
+ * - By "AVX10/256" we really mean (AVX512BW + AVX512VL) || AVX10/256.
+ * To avoid confusion with 512-bit, we just write AVX10/256.
+ *
+ * VAES + VPCLMULQDQ + AVX10/512 + BMI2
+ * - Same as the previous one, but upgrades to 512-bit vectors
+ * (4 AES blocks per vector) in zmm0-zmm31.
+ * - This is for CPUs that have good AVX512 or AVX10/512 support.
+ *
+ * This file doesn't have an implementation for AES-NI alone (without AVX), as
+ * the lack of VEX would make all the assembly code different.
+ *
+ * When we use VAES, we also use VPCLMULQDQ to parallelize the computation of
+ * the XTS tweaks. This avoids a bottleneck. Currently there don't seem to be
+ * any CPUs that support VAES but not VPCLMULQDQ. If that changes, we might
+ * need to start also providing an implementation using VAES alone.
+ *
+ * The AES-XTS implementations in this file support everything required by the
+ * crypto API, including support for arbitrary input lengths and multi-part
+ * processing. However, they are most heavily optimized for the common case of
+ * power-of-2 length inputs that are processed in a single part (disk sectors).
+ */
+
+#include <linux/linkage.h>
+#include <linux/cfi_types.h>
+
+.section .rodata
+.p2align 4
+.Lgf_poly:
+ // The low 64 bits of this value represent the polynomial x^7 + x^2 + x
+ // + 1. It is the value that must be XOR'd into the low 64 bits of the
+ // tweak each time a 1 is carried out of the high 64 bits.
+ //
+ // The high 64 bits of this value is just the internal carry bit that
+ // exists when there's a carry out of the low 64 bits of the tweak.
+ .quad 0x87, 1
+
+ // This table contains constants for vpshufb and vpblendvb, used to
+ // handle variable byte shifts and blending during ciphertext stealing
+ // on CPUs that don't support AVX10-style masking.
+.Lcts_permute_table:
+ .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
+ .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
+ .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
+ .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+ .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
+ .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
+.text
+
+// Function parameters
+.set KEY, %rdi // Initially points to crypto_aes_ctx, then is
+ // advanced to point to 7th-from-last round key
+.set SRC, %rsi // Pointer to next source data
+.set DST, %rdx // Pointer to next destination data
+.set LEN, %ecx // Remaining length in bytes
+.set LEN8, %cl
+.set LEN64, %rcx
+.set TWEAK, %r8 // Pointer to next tweak
+
+// %rax holds the AES key length in bytes.
+.set KEYLEN, %eax
+.set KEYLEN64, %rax
+
+// %r9-r11 are available as temporaries.
+
+.macro _define_Vi i
+.if VL == 16
+ .set V\i, %xmm\i
+.elseif VL == 32
+ .set V\i, %ymm\i
+.elseif VL == 64
+ .set V\i, %zmm\i
+.else
+ .error "Unsupported Vector Length (VL)"
+.endif
+.endm
+
+.macro _define_aliases
+ // Define register aliases V0-V15, or V0-V31 if all 32 SIMD registers
+ // are available, that map to the xmm, ymm, or zmm registers according
+ // to the selected Vector Length (VL).
+ _define_Vi 0
+ _define_Vi 1
+ _define_Vi 2
+ _define_Vi 3
+ _define_Vi 4
+ _define_Vi 5
+ _define_Vi 6
+ _define_Vi 7
+ _define_Vi 8
+ _define_Vi 9
+ _define_Vi 10
+ _define_Vi 11
+ _define_Vi 12
+ _define_Vi 13
+ _define_Vi 14
+ _define_Vi 15
+.if USE_AVX10
+ _define_Vi 16
+ _define_Vi 17
+ _define_Vi 18
+ _define_Vi 19
+ _define_Vi 20
+ _define_Vi 21
+ _define_Vi 22
+ _define_Vi 23
+ _define_Vi 24
+ _define_Vi 25
+ _define_Vi 26
+ _define_Vi 27
+ _define_Vi 28
+ _define_Vi 29
+ _define_Vi 30
+ _define_Vi 31
+.endif
+
+ // V0-V3 hold the data blocks during the main loop, or temporary values
+ // otherwise. V4-V5 hold temporary values.
+
+ // V6-V9 hold XTS tweaks. Each 128-bit lane holds one tweak.
+ .set TWEAK0_XMM, %xmm6
+ .set TWEAK0, V6
+ .set TWEAK1_XMM, %xmm7
+ .set TWEAK1, V7
+ .set TWEAK2, V8
+ .set TWEAK3, V9
+
+ // V10-V13 are used for computing the next values of TWEAK[0-3].
+ .set NEXT_TWEAK0, V10
+ .set NEXT_TWEAK1, V11
+ .set NEXT_TWEAK2, V12
+ .set NEXT_TWEAK3, V13
+
+ // V14 holds the constant from .Lgf_poly, copied to all 128-bit lanes.
+ .set GF_POLY_XMM, %xmm14
+ .set GF_POLY, V14
+
+ // V15 holds the key for AES "round 0", copied to all 128-bit lanes.
+ .set KEY0_XMM, %xmm15
+ .set KEY0, V15
+
+ // If 32 SIMD registers are available, then V16-V29 hold the remaining
+ // AES round keys, copied to all 128-bit lanes.
+ //
+ // AES-128, AES-192, and AES-256 use different numbers of round keys.
+ // To allow handling all three variants efficiently, we align the round
+ // keys to the *end* of this register range. I.e., AES-128 uses
+ // KEY5-KEY14, AES-192 uses KEY3-KEY14, and AES-256 uses KEY1-KEY14.
+ // (All also use KEY0 for the XOR-only "round" at the beginning.)
+.if USE_AVX10
+ .set KEY1_XMM, %xmm16
+ .set KEY1, V16
+ .set KEY2_XMM, %xmm17
+ .set KEY2, V17
+ .set KEY3_XMM, %xmm18
+ .set KEY3, V18
+ .set KEY4_XMM, %xmm19
+ .set KEY4, V19
+ .set KEY5_XMM, %xmm20
+ .set KEY5, V20
+ .set KEY6_XMM, %xmm21
+ .set KEY6, V21
+ .set KEY7_XMM, %xmm22
+ .set KEY7, V22
+ .set KEY8_XMM, %xmm23
+ .set KEY8, V23
+ .set KEY9_XMM, %xmm24
+ .set KEY9, V24
+ .set KEY10_XMM, %xmm25
+ .set KEY10, V25
+ .set KEY11_XMM, %xmm26
+ .set KEY11, V26
+ .set KEY12_XMM, %xmm27
+ .set KEY12, V27
+ .set KEY13_XMM, %xmm28
+ .set KEY13, V28
+ .set KEY14_XMM, %xmm29
+ .set KEY14, V29
+.endif
+ // V30-V31 are currently unused.
+.endm
+
+// Move a vector between memory and a register.
+.macro _vmovdqu src, dst
+.if VL < 64
+ vmovdqu \src, \dst
+.else
+ vmovdqu8 \src, \dst
+.endif
+.endm
+
+// Broadcast a 128-bit value into a vector.
+.macro _vbroadcast128 src, dst
+.if VL == 16 && !USE_AVX10
+ vmovdqu \src, \dst
+.elseif VL == 32 && !USE_AVX10
+ vbroadcasti128 \src, \dst
+.else
+ vbroadcasti32x4 \src, \dst
+.endif
+.endm
+
+// XOR two vectors together.
+.macro _vpxor src1, src2, dst
+.if USE_AVX10
+ vpxord \src1, \src2, \dst
+.else
+ vpxor \src1, \src2, \dst
+.endif
+.endm
+
+// XOR three vectors together.
+.macro _xor3 src1, src2, src3_and_dst
+.if USE_AVX10
+ // vpternlogd with immediate 0x96 is a three-argument XOR.
+ vpternlogd $0x96, \src1, \src2, \src3_and_dst
+.else
+ vpxor \src1, \src3_and_dst, \src3_and_dst
+ vpxor \src2, \src3_and_dst, \src3_and_dst
+.endif
+.endm
+
+// Given a 128-bit XTS tweak in the xmm register \src, compute the next tweak
+// (by multiplying by the polynomial 'x') and write it to \dst.
+.macro _next_tweak src, tmp, dst
+ vpshufd $0x13, \src, \tmp
+ vpaddq \src, \src, \dst
+ vpsrad $31, \tmp, \tmp
+ vpand GF_POLY_XMM, \tmp, \tmp
+ vpxor \tmp, \dst, \dst
+.endm
+
+// Given the XTS tweak(s) in the vector \src, compute the next vector of
+// tweak(s) (by multiplying by the polynomial 'x^(VL/16)') and write it to \dst.
+//
+// If VL > 16, then there are multiple tweaks, and we use vpclmulqdq to compute
+// all tweaks in the vector in parallel. If VL=16, we just do the regular
+// computation without vpclmulqdq, as it's the faster method for a single tweak.
+.macro _next_tweakvec src, tmp1, tmp2, dst
+.if VL == 16
+ _next_tweak \src, \tmp1, \dst
+.else
+ vpsrlq $64 - VL/16, \src, \tmp1
+ vpclmulqdq $0x01, GF_POLY, \tmp1, \tmp2
+ vpslldq $8, \tmp1, \tmp1
+ vpsllq $VL/16, \src, \dst
+ _xor3 \tmp1, \tmp2, \dst
+.endif
+.endm
+
+// Given the first XTS tweak at (TWEAK), compute the first set of tweaks and
+// store them in the vector registers TWEAK0-TWEAK3. Clobbers V0-V5.
+.macro _compute_first_set_of_tweaks
+ vmovdqu (TWEAK), TWEAK0_XMM
+ _vbroadcast128 .Lgf_poly(%rip), GF_POLY
+.if VL == 16
+ // With VL=16, multiplying by x serially is fastest.
+ _next_tweak TWEAK0, %xmm0, TWEAK1
+ _next_tweak TWEAK1, %xmm0, TWEAK2
+ _next_tweak TWEAK2, %xmm0, TWEAK3
+.else
+.if VL == 32
+ // Compute the second block of TWEAK0.
+ _next_tweak TWEAK0_XMM, %xmm0, %xmm1
+ vinserti128 $1, %xmm1, TWEAK0, TWEAK0
+.elseif VL == 64
+ // Compute the remaining blocks of TWEAK0.
+ _next_tweak TWEAK0_XMM, %xmm0, %xmm1
+ _next_tweak %xmm1, %xmm0, %xmm2
+ _next_tweak %xmm2, %xmm0, %xmm3
+ vinserti32x4 $1, %xmm1, TWEAK0, TWEAK0
+ vinserti32x4 $2, %xmm2, TWEAK0, TWEAK0
+ vinserti32x4 $3, %xmm3, TWEAK0, TWEAK0
+.endif
+ // Compute TWEAK[1-3] from TWEAK0.
+ vpsrlq $64 - 1*VL/16, TWEAK0, V0
+ vpsrlq $64 - 2*VL/16, TWEAK0, V2
+ vpsrlq $64 - 3*VL/16, TWEAK0, V4
+ vpclmulqdq $0x01, GF_POLY, V0, V1
+ vpclmulqdq $0x01, GF_POLY, V2, V3
+ vpclmulqdq $0x01, GF_POLY, V4, V5
+ vpslldq $8, V0, V0
+ vpslldq $8, V2, V2
+ vpslldq $8, V4, V4
+ vpsllq $1*VL/16, TWEAK0, TWEAK1
+ vpsllq $2*VL/16, TWEAK0, TWEAK2
+ vpsllq $3*VL/16, TWEAK0, TWEAK3
+.if USE_AVX10
+ vpternlogd $0x96, V0, V1, TWEAK1
+ vpternlogd $0x96, V2, V3, TWEAK2
+ vpternlogd $0x96, V4, V5, TWEAK3
+.else
+ vpxor V0, TWEAK1, TWEAK1
+ vpxor V2, TWEAK2, TWEAK2
+ vpxor V4, TWEAK3, TWEAK3
+ vpxor V1, TWEAK1, TWEAK1
+ vpxor V3, TWEAK2, TWEAK2
+ vpxor V5, TWEAK3, TWEAK3
+.endif
+.endif
+.endm
+
+// Do one step in computing the next set of tweaks using the method of just
+// multiplying by x repeatedly (the same method _next_tweak uses).
+.macro _tweak_step_mulx i
+.if \i == 0
+ .set PREV_TWEAK, TWEAK3
+ .set NEXT_TWEAK, NEXT_TWEAK0
+.elseif \i == 5
+ .set PREV_TWEAK, NEXT_TWEAK0
+ .set NEXT_TWEAK, NEXT_TWEAK1
+.elseif \i == 10
+ .set PREV_TWEAK, NEXT_TWEAK1
+ .set NEXT_TWEAK, NEXT_TWEAK2
+.elseif \i == 15
+ .set PREV_TWEAK, NEXT_TWEAK2
+ .set NEXT_TWEAK, NEXT_TWEAK3
+.endif
+.if \i >= 0 && \i < 20 && \i % 5 == 0
+ vpshufd $0x13, PREV_TWEAK, V5
+.elseif \i >= 0 && \i < 20 && \i % 5 == 1
+ vpaddq PREV_TWEAK, PREV_TWEAK, NEXT_TWEAK
+.elseif \i >= 0 && \i < 20 && \i % 5 == 2
+ vpsrad $31, V5, V5
+.elseif \i >= 0 && \i < 20 && \i % 5 == 3
+ vpand GF_POLY, V5, V5
+.elseif \i >= 0 && \i < 20 && \i % 5 == 4
+ vpxor V5, NEXT_TWEAK, NEXT_TWEAK
+.elseif \i == 1000
+ vmovdqa NEXT_TWEAK0, TWEAK0
+ vmovdqa NEXT_TWEAK1, TWEAK1
+ vmovdqa NEXT_TWEAK2, TWEAK2
+ vmovdqa NEXT_TWEAK3, TWEAK3
+.endif
+.endm
+
+// Do one step in computing the next set of tweaks using the VPCLMULQDQ method
+// (the same method _next_tweakvec uses for VL > 16). This means multiplying
+// each tweak by x^(4*VL/16) independently. Since 4*VL/16 is a multiple of 8
+// when VL > 16 (which it is here), the needed shift amounts are byte-aligned,
+// which allows the use of vpsrldq and vpslldq to do 128-bit wide shifts.
+.macro _tweak_step_pclmul i
+.if \i == 0
+ vpsrldq $(128 - 4*VL/16) / 8, TWEAK0, NEXT_TWEAK0
+.elseif \i == 2
+ vpsrldq $(128 - 4*VL/16) / 8, TWEAK1, NEXT_TWEAK1
+.elseif \i == 4
+ vpsrldq $(128 - 4*VL/16) / 8, TWEAK2, NEXT_TWEAK2
+.elseif \i == 6
+ vpsrldq $(128 - 4*VL/16) / 8, TWEAK3, NEXT_TWEAK3
+.elseif \i == 8
+ vpclmulqdq $0x00, GF_POLY, NEXT_TWEAK0, NEXT_TWEAK0
+.elseif \i == 10
+ vpclmulqdq $0x00, GF_POLY, NEXT_TWEAK1, NEXT_TWEAK1
+.elseif \i == 12
+ vpclmulqdq $0x00, GF_POLY, NEXT_TWEAK2, NEXT_TWEAK2
+.elseif \i == 14
+ vpclmulqdq $0x00, GF_POLY, NEXT_TWEAK3, NEXT_TWEAK3
+.elseif \i == 1000
+ vpslldq $(4*VL/16) / 8, TWEAK0, TWEAK0
+ vpslldq $(4*VL/16) / 8, TWEAK1, TWEAK1
+ vpslldq $(4*VL/16) / 8, TWEAK2, TWEAK2
+ vpslldq $(4*VL/16) / 8, TWEAK3, TWEAK3
+ _vpxor NEXT_TWEAK0, TWEAK0, TWEAK0
+ _vpxor NEXT_TWEAK1, TWEAK1, TWEAK1
+ _vpxor NEXT_TWEAK2, TWEAK2, TWEAK2
+ _vpxor NEXT_TWEAK3, TWEAK3, TWEAK3
+.endif
+.endm
+
+// _tweak_step does one step of the computation of the next set of tweaks from
+// TWEAK[0-3]. To complete all steps, this is invoked with increasing values of
+// \i that include at least 0 through 19, then 1000 which signals the last step.
+//
+// This is used to interleave the computation of the next set of tweaks with the
+// AES en/decryptions, which increases performance in some cases.
+.macro _tweak_step i
+.if VL == 16
+ _tweak_step_mulx \i
+.else
+ _tweak_step_pclmul \i
+.endif
+.endm
+
+.macro _setup_round_keys enc
+
+ // Select either the encryption round keys or the decryption round keys.
+.if \enc
+ .set OFFS, 0
+.else
+ .set OFFS, 240
+.endif
+
+ // Load the round key for "round 0".
+ _vbroadcast128 OFFS(KEY), KEY0
+
+ // Increment KEY to make it so that 7*16(KEY) is the last round key.
+ // For AES-128, increment by 3*16, resulting in the 10 round keys (not
+ // counting the zero-th round key which was just loaded into KEY0) being
+ // -2*16(KEY) through 7*16(KEY). For AES-192, increment by 5*16 and use
+ // 12 round keys -4*16(KEY) through 7*16(KEY). For AES-256, increment
+ // by 7*16 and use 14 round keys -6*16(KEY) through 7*16(KEY).
+ //
+ // This rebasing provides two benefits. First, it makes the offset to
+ // any round key be in the range [-96, 112], fitting in a signed byte.
+ // This shortens VEX-encoded instructions that access the later round
+ // keys which otherwise would need 4-byte offsets. Second, it makes it
+ // easy to do AES-128 and AES-192 by skipping irrelevant rounds at the
+ // beginning. Skipping rounds at the end doesn't work as well because
+ // the last round needs different instructions.
+ //
+ // An alternative approach would be to roll up all the round loops. We
+ // don't do that because it isn't compatible with caching the round keys
+ // in registers which we do when possible (see below), and also because
+ // it seems unwise to rely *too* heavily on the CPU's branch predictor.
+ lea OFFS-16(KEY, KEYLEN64, 4), KEY
+
+ // If all 32 SIMD registers are available, cache all the round keys.
+.if USE_AVX10
+ cmp $24, KEYLEN
+ jl .Laes128\@
+ je .Laes192\@
+ _vbroadcast128 -6*16(KEY), KEY1
+ _vbroadcast128 -5*16(KEY), KEY2
+.Laes192\@:
+ _vbroadcast128 -4*16(KEY), KEY3
+ _vbroadcast128 -3*16(KEY), KEY4
+.Laes128\@:
+ _vbroadcast128 -2*16(KEY), KEY5
+ _vbroadcast128 -1*16(KEY), KEY6
+ _vbroadcast128 0*16(KEY), KEY7
+ _vbroadcast128 1*16(KEY), KEY8
+ _vbroadcast128 2*16(KEY), KEY9
+ _vbroadcast128 3*16(KEY), KEY10
+ _vbroadcast128 4*16(KEY), KEY11
+ _vbroadcast128 5*16(KEY), KEY12
+ _vbroadcast128 6*16(KEY), KEY13
+ _vbroadcast128 7*16(KEY), KEY14
+.endif
+.endm
+
+// Do a single round of AES encryption (if \enc==1) or decryption (if \enc==0)
+// on the block(s) in \data using the round key(s) in \key. The register length
+// determines the number of AES blocks en/decrypted.
+.macro _vaes enc, last, key, data
+.if \enc
+.if \last
+ vaesenclast \key, \data, \data
+.else
+ vaesenc \key, \data, \data
+.endif
+.else
+.if \last
+ vaesdeclast \key, \data, \data
+.else
+ vaesdec \key, \data, \data
+.endif
+.endif
+.endm
+
+// Do a single round of AES en/decryption on the block(s) in \data, using the
+// same key for all block(s). The round key is loaded from the appropriate
+// register or memory location for round \i. May clobber V4.
+.macro _vaes_1x enc, last, i, xmm_suffix, data
+.if USE_AVX10
+ _vaes \enc, \last, KEY\i\xmm_suffix, \data
+.else
+.ifnb \xmm_suffix
+ _vaes \enc, \last, (\i-7)*16(KEY), \data
+.else
+ _vbroadcast128 (\i-7)*16(KEY), V4
+ _vaes \enc, \last, V4, \data
+.endif
+.endif
+.endm
+
+// Do a single round of AES en/decryption on the blocks in registers V0-V3,
+// using the same key for all blocks. The round key is loaded from the
+// appropriate register or memory location for round \i. In addition, does two
+// steps of the computation of the next set of tweaks. May clobber V4.
+.macro _vaes_4x enc, last, i
+.if USE_AVX10
+ _tweak_step (2*(\i-5))
+ _vaes \enc, \last, KEY\i, V0
+ _vaes \enc, \last, KEY\i, V1
+ _tweak_step (2*(\i-5) + 1)
+ _vaes \enc, \last, KEY\i, V2
+ _vaes \enc, \last, KEY\i, V3
+.else
+ _vbroadcast128 (\i-7)*16(KEY), V4
+ _tweak_step (2*(\i-5))
+ _vaes \enc, \last, V4, V0
+ _vaes \enc, \last, V4, V1
+ _tweak_step (2*(\i-5) + 1)
+ _vaes \enc, \last, V4, V2
+ _vaes \enc, \last, V4, V3
+.endif
+.endm
+
+// Do tweaked AES en/decryption (i.e., XOR with \tweak, then AES en/decrypt,
+// then XOR with \tweak again) of the block(s) in \data. To process a single
+// block, use xmm registers and set \xmm_suffix=_XMM. To process a vector of
+// length VL, use V* registers and leave \xmm_suffix empty. May clobber V4.
+.macro _aes_crypt enc, xmm_suffix, tweak, data
+ _xor3 KEY0\xmm_suffix, \tweak, \data
+ cmp $24, KEYLEN
+ jl .Laes128\@
+ je .Laes192\@
+ _vaes_1x \enc, 0, 1, \xmm_suffix, \data
+ _vaes_1x \enc, 0, 2, \xmm_suffix, \data
+.Laes192\@:
+ _vaes_1x \enc, 0, 3, \xmm_suffix, \data
+ _vaes_1x \enc, 0, 4, \xmm_suffix, \data
+.Laes128\@:
+ _vaes_1x \enc, 0, 5, \xmm_suffix, \data
+ _vaes_1x \enc, 0, 6, \xmm_suffix, \data
+ _vaes_1x \enc, 0, 7, \xmm_suffix, \data
+ _vaes_1x \enc, 0, 8, \xmm_suffix, \data
+ _vaes_1x \enc, 0, 9, \xmm_suffix, \data
+ _vaes_1x \enc, 0, 10, \xmm_suffix, \data
+ _vaes_1x \enc, 0, 11, \xmm_suffix, \data
+ _vaes_1x \enc, 0, 12, \xmm_suffix, \data
+ _vaes_1x \enc, 0, 13, \xmm_suffix, \data
+ _vaes_1x \enc, 1, 14, \xmm_suffix, \data
+ _vpxor \tweak, \data, \data
+.endm
+
+.macro _aes_xts_crypt enc
+ _define_aliases
+
+.if !\enc
+ // When decrypting a message whose length isn't a multiple of the AES
+ // block length, exclude the last full block from the main loop by
+ // subtracting 16 from LEN. This is needed because ciphertext stealing
+ // decryption uses the last two tweaks in reverse order. We'll handle
+ // the last full block and the partial block specially at the end.
+ lea -16(LEN), %eax
+ test $15, LEN8
+ cmovnz %eax, LEN
+.endif
+
+ // Load the AES key length: 16 (AES-128), 24 (AES-192), or 32 (AES-256).
+ movl 480(KEY), KEYLEN
+
+ // Setup the pointer to the round keys and cache as many as possible.
+ _setup_round_keys \enc
+
+ // Compute the first set of tweaks TWEAK[0-3].
+ _compute_first_set_of_tweaks
+
+ sub $4*VL, LEN
+ jl .Lhandle_remainder\@
+
+.Lmain_loop\@:
+ // This is the main loop, en/decrypting 4*VL bytes per iteration.
+
+ // XOR each source block with its tweak and the zero-th round key.
+.if USE_AVX10
+ vmovdqu8 0*VL(SRC), V0
+ vmovdqu8 1*VL(SRC), V1
+ vmovdqu8 2*VL(SRC), V2
+ vmovdqu8 3*VL(SRC), V3
+ vpternlogd $0x96, TWEAK0, KEY0, V0
+ vpternlogd $0x96, TWEAK1, KEY0, V1
+ vpternlogd $0x96, TWEAK2, KEY0, V2
+ vpternlogd $0x96, TWEAK3, KEY0, V3
+.else
+ vpxor 0*VL(SRC), KEY0, V0
+ vpxor 1*VL(SRC), KEY0, V1
+ vpxor 2*VL(SRC), KEY0, V2
+ vpxor 3*VL(SRC), KEY0, V3
+ vpxor TWEAK0, V0, V0
+ vpxor TWEAK1, V1, V1
+ vpxor TWEAK2, V2, V2
+ vpxor TWEAK3, V3, V3
+.endif
+ cmp $24, KEYLEN
+ jl .Laes128\@
+ je .Laes192\@
+ // Do all the AES rounds on the data blocks, interleaved with
+ // the computation of the next set of tweaks.
+ _vaes_4x \enc, 0, 1
+ _vaes_4x \enc, 0, 2
+.Laes192\@:
+ _vaes_4x \enc, 0, 3
+ _vaes_4x \enc, 0, 4
+.Laes128\@:
+ _vaes_4x \enc, 0, 5
+ _vaes_4x \enc, 0, 6
+ _vaes_4x \enc, 0, 7
+ _vaes_4x \enc, 0, 8
+ _vaes_4x \enc, 0, 9
+ _vaes_4x \enc, 0, 10
+ _vaes_4x \enc, 0, 11
+ _vaes_4x \enc, 0, 12
+ _vaes_4x \enc, 0, 13
+ _vaes_4x \enc, 1, 14
+
+ // XOR in the tweaks again.
+ _vpxor TWEAK0, V0, V0
+ _vpxor TWEAK1, V1, V1
+ _vpxor TWEAK2, V2, V2
+ _vpxor TWEAK3, V3, V3
+
+ // Store the destination blocks.
+ _vmovdqu V0, 0*VL(DST)
+ _vmovdqu V1, 1*VL(DST)
+ _vmovdqu V2, 2*VL(DST)
+ _vmovdqu V3, 3*VL(DST)
+
+ // Finish computing the next set of tweaks.
+ _tweak_step 1000
+
+ add $4*VL, SRC
+ add $4*VL, DST
+ sub $4*VL, LEN
+ jge .Lmain_loop\@
+
+ // Check for the uncommon case where the data length isn't a multiple of
+ // 4*VL. Handle it out-of-line in order to optimize for the common
+ // case. In the common case, just fall through to the ret.
+ test $4*VL-1, LEN8
+ jnz .Lhandle_remainder\@
+.Ldone\@:
+ // Store the next tweak back to *TWEAK to support continuation calls.
+ vmovdqu TWEAK0_XMM, (TWEAK)
+.if VL > 16
+ vzeroupper
+.endif
+ RET
+
+.Lhandle_remainder\@:
+
+ // En/decrypt any remaining full blocks, one vector at a time.
+.if VL > 16
+ add $3*VL, LEN // Undo extra sub of 4*VL, then sub VL.
+ jl .Lvec_at_a_time_done\@
+.Lvec_at_a_time\@:
+ _vmovdqu (SRC), V0
+ _aes_crypt \enc, , TWEAK0, V0
+ _vmovdqu V0, (DST)
+ _next_tweakvec TWEAK0, V0, V1, TWEAK0
+ add $VL, SRC
+ add $VL, DST
+ sub $VL, LEN
+ jge .Lvec_at_a_time\@
+.Lvec_at_a_time_done\@:
+ add $VL-16, LEN // Undo extra sub of VL, then sub 16.
+.else
+ add $4*VL-16, LEN // Undo extra sub of 4*VL, then sub 16.
+.endif
+
+ // En/decrypt any remaining full blocks, one at a time.
+ jl .Lblock_at_a_time_done\@
+.Lblock_at_a_time\@:
+ vmovdqu (SRC), %xmm0
+ _aes_crypt \enc, _XMM, TWEAK0_XMM, %xmm0
+ vmovdqu %xmm0, (DST)
+ _next_tweak TWEAK0_XMM, %xmm0, TWEAK0_XMM
+ add $16, SRC
+ add $16, DST
+ sub $16, LEN
+ jge .Lblock_at_a_time\@
+.Lblock_at_a_time_done\@:
+ add $16, LEN // Undo the extra sub of 16.
+ // Now 0 <= LEN <= 15. If LEN is zero, we're done.
+ jz .Ldone\@
+
+ // Otherwise 1 <= LEN <= 15, but the real remaining length is 16 + LEN.
+ // Do ciphertext stealing to process the last 16 + LEN bytes.
+
+.if \enc
+ // If encrypting, the main loop already encrypted the last full block to
+ // create the CTS intermediate ciphertext. Prepare for the rest of CTS
+ // by rewinding the pointers and loading the intermediate ciphertext.
+ sub $16, SRC
+ sub $16, DST
+ vmovdqu (DST), %xmm0
+.else
+ // If decrypting, the main loop didn't decrypt the last full block
+ // because CTS decryption uses the last two tweaks in reverse order.
+ // Do it now by advancing the tweak and decrypting the last full block.
+ _next_tweak TWEAK0_XMM, %xmm0, TWEAK1_XMM
+ vmovdqu (SRC), %xmm0
+ _aes_crypt \enc, _XMM, TWEAK1_XMM, %xmm0
+.endif
+
+.if USE_AVX10
+ // Create a mask that has the first LEN bits set.
+ mov $-1, %r9d
+ bzhi LEN, %r9d, %r9d
+ kmovd %r9d, %k1
+
+ // Swap the first LEN bytes of the en/decryption of the last full block
+ // with the partial block. Note that to support in-place en/decryption,
+ // the load from the src partial block must happen before the store to
+ // the dst partial block.
+ vmovdqa %xmm0, %xmm1
+ vmovdqu8 16(SRC), %xmm0{%k1}
+ vmovdqu8 %xmm1, 16(DST){%k1}
+.else
+ lea .Lcts_permute_table(%rip), %r9
+
+ // Load the src partial block, left-aligned. Note that to support
+ // in-place en/decryption, this must happen before the store to the dst
+ // partial block.
+ vmovdqu (SRC, LEN64, 1), %xmm1
+
+ // Shift the first LEN bytes of the en/decryption of the last full block
+ // to the end of a register, then store it to DST+LEN. This stores the
+ // dst partial block. It also writes to the second part of the dst last
+ // full block, but that part is overwritten later.
+ vpshufb (%r9, LEN64, 1), %xmm0, %xmm2
+ vmovdqu %xmm2, (DST, LEN64, 1)
+
+ // Make xmm3 contain [16-LEN,16-LEN+1,...,14,15,0x80,0x80,...].
+ sub LEN64, %r9
+ vmovdqu 32(%r9), %xmm3
+
+ // Shift the src partial block to the beginning of its register.
+ vpshufb %xmm3, %xmm1, %xmm1
+
+ // Do a blend to generate the src partial block followed by the second
+ // part of the en/decryption of the last full block.
+ vpblendvb %xmm3, %xmm0, %xmm1, %xmm0
+.endif
+ // En/decrypt again and store the last full block.
+ _aes_crypt \enc, _XMM, TWEAK0_XMM, %xmm0
+ vmovdqu %xmm0, (DST)
+ jmp .Ldone\@
+.endm
+
+// void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key,
+// u8 iv[AES_BLOCK_SIZE]);
+SYM_TYPED_FUNC_START(aes_xts_encrypt_iv)
+ vmovdqu (%rsi), %xmm0
+ vpxor (%rdi), %xmm0, %xmm0
+ movl 480(%rdi), %eax // AES key length
+ lea -16(%rdi, %rax, 4), %rdi
+ cmp $24, %eax
+ jl .Lencrypt_iv_aes128
+ je .Lencrypt_iv_aes192
+ vaesenc -6*16(%rdi), %xmm0, %xmm0
+ vaesenc -5*16(%rdi), %xmm0, %xmm0
+.Lencrypt_iv_aes192:
+ vaesenc -4*16(%rdi), %xmm0, %xmm0
+ vaesenc -3*16(%rdi), %xmm0, %xmm0
+.Lencrypt_iv_aes128:
+ vaesenc -2*16(%rdi), %xmm0, %xmm0
+ vaesenc -1*16(%rdi), %xmm0, %xmm0
+ vaesenc 0*16(%rdi), %xmm0, %xmm0
+ vaesenc 1*16(%rdi), %xmm0, %xmm0
+ vaesenc 2*16(%rdi), %xmm0, %xmm0
+ vaesenc 3*16(%rdi), %xmm0, %xmm0
+ vaesenc 4*16(%rdi), %xmm0, %xmm0
+ vaesenc 5*16(%rdi), %xmm0, %xmm0
+ vaesenc 6*16(%rdi), %xmm0, %xmm0
+ vaesenclast 7*16(%rdi), %xmm0, %xmm0
+ vmovdqu %xmm0, (%rsi)
+ RET
+SYM_FUNC_END(aes_xts_encrypt_iv)
+
+// Below are the actual AES-XTS encryption and decryption functions,
+// instantiated from the above macro. They all have the following prototype:
+//
+// void (*xts_asm_func)(const struct crypto_aes_ctx *key,
+// const u8 *src, u8 *dst, unsigned int len,
+// u8 tweak[AES_BLOCK_SIZE]);
+//
+// |key| is the data key. |tweak| contains the next tweak; the encryption of
+// the original IV with the tweak key was already done. This function supports
+// incremental computation, but |len| must always be >= 16 (AES_BLOCK_SIZE), and
+// |len| must be a multiple of 16 except on the last call. If |len| is a
+// multiple of 16, then this function updates |tweak| to contain the next tweak.
+
+.set VL, 16
+.set USE_AVX10, 0
+SYM_TYPED_FUNC_START(aes_xts_encrypt_aesni_avx)
+ _aes_xts_crypt 1
+SYM_FUNC_END(aes_xts_encrypt_aesni_avx)
+SYM_TYPED_FUNC_START(aes_xts_decrypt_aesni_avx)
+ _aes_xts_crypt 0
+SYM_FUNC_END(aes_xts_decrypt_aesni_avx)
+
+#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ)
+.set VL, 32
+.set USE_AVX10, 0
+SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx2)
+ _aes_xts_crypt 1
+SYM_FUNC_END(aes_xts_encrypt_vaes_avx2)
+SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx2)
+ _aes_xts_crypt 0
+SYM_FUNC_END(aes_xts_decrypt_vaes_avx2)
+
+.set VL, 32
+.set USE_AVX10, 1
+SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx10_256)
+ _aes_xts_crypt 1
+SYM_FUNC_END(aes_xts_encrypt_vaes_avx10_256)
+SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx10_256)
+ _aes_xts_crypt 0
+SYM_FUNC_END(aes_xts_decrypt_vaes_avx10_256)
+
+.set VL, 64
+.set USE_AVX10, 1
+SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx10_512)
+ _aes_xts_crypt 1
+SYM_FUNC_END(aes_xts_encrypt_vaes_avx10_512)
+SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx10_512)
+ _aes_xts_crypt 0
+SYM_FUNC_END(aes_xts_decrypt_vaes_avx10_512)
+#endif /* CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ */
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index 411d8c83e88a..eb153eff9331 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -10,16 +10,7 @@
* Vinodh Gopal <vinodh.gopal@intel.com>
* Kahraman Akdemir
*
- * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
- * interface for 64-bit kernels.
- * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
- * Aidan O'Mahony (aidan.o.mahony@intel.com)
- * Adrian Hoban <adrian.hoban@intel.com>
- * James Guilford (james.guilford@intel.com)
- * Gabriele Paoloni <gabriele.paoloni@intel.com>
- * Tadeusz Struk (tadeusz.struk@intel.com)
- * Wajdi Feghali (wajdi.k.feghali@intel.com)
- * Copyright (c) 2010, Intel Corporation.
+ * Copyright (c) 2010, Intel Corporation.
*
* Ported x86_64 version to x86:
* Author: Mathias Krause <minipli@googlemail.com>
@@ -27,103 +18,6 @@
#include <linux/linkage.h>
#include <asm/frame.h>
-#include <asm/nospec-branch.h>
-
-/*
- * The following macros are used to move an (un)aligned 16 byte value to/from
- * an XMM register. This can done for either FP or integer values, for FP use
- * movaps (move aligned packed single) or integer use movdqa (move double quad
- * aligned). It doesn't make a performance difference which instruction is used
- * since Nehalem (original Core i7) was released. However, the movaps is a byte
- * shorter, so that is the one we'll use for now. (same for unaligned).
- */
-#define MOVADQ movaps
-#define MOVUDQ movups
-
-#ifdef __x86_64__
-
-# constants in mergeable sections, linker can reorder and merge
-.section .rodata.cst16.POLY, "aM", @progbits, 16
-.align 16
-POLY: .octa 0xC2000000000000000000000000000001
-.section .rodata.cst16.TWOONE, "aM", @progbits, 16
-.align 16
-TWOONE: .octa 0x00000001000000000000000000000001
-
-.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
-.align 16
-SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
-.section .rodata.cst16.MASK1, "aM", @progbits, 16
-.align 16
-MASK1: .octa 0x0000000000000000ffffffffffffffff
-.section .rodata.cst16.MASK2, "aM", @progbits, 16
-.align 16
-MASK2: .octa 0xffffffffffffffff0000000000000000
-.section .rodata.cst16.ONE, "aM", @progbits, 16
-.align 16
-ONE: .octa 0x00000000000000000000000000000001
-.section .rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
-.align 16
-F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
-.section .rodata.cst16.dec, "aM", @progbits, 16
-.align 16
-dec: .octa 0x1
-.section .rodata.cst16.enc, "aM", @progbits, 16
-.align 16
-enc: .octa 0x2
-
-# order of these constants should not change.
-# more specifically, ALL_F should follow SHIFT_MASK,
-# and zero should follow ALL_F
-.section .rodata, "a", @progbits
-.align 16
-SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
-ALL_F: .octa 0xffffffffffffffffffffffffffffffff
- .octa 0x00000000000000000000000000000000
-
-.text
-
-
-#define STACK_OFFSET 8*3
-
-#define AadHash 16*0
-#define AadLen 16*1
-#define InLen (16*1)+8
-#define PBlockEncKey 16*2
-#define OrigIV 16*3
-#define CurCount 16*4
-#define PBlockLen 16*5
-#define HashKey 16*6 // store HashKey <<1 mod poly here
-#define HashKey_2 16*7 // store HashKey^2 <<1 mod poly here
-#define HashKey_3 16*8 // store HashKey^3 <<1 mod poly here
-#define HashKey_4 16*9 // store HashKey^4 <<1 mod poly here
-#define HashKey_k 16*10 // store XOR of High 64 bits and Low 64
- // bits of HashKey <<1 mod poly here
- //(for Karatsuba purposes)
-#define HashKey_2_k 16*11 // store XOR of High 64 bits and Low 64
- // bits of HashKey^2 <<1 mod poly here
- // (for Karatsuba purposes)
-#define HashKey_3_k 16*12 // store XOR of High 64 bits and Low 64
- // bits of HashKey^3 <<1 mod poly here
- // (for Karatsuba purposes)
-#define HashKey_4_k 16*13 // store XOR of High 64 bits and Low 64
- // bits of HashKey^4 <<1 mod poly here
- // (for Karatsuba purposes)
-
-#define arg1 rdi
-#define arg2 rsi
-#define arg3 rdx
-#define arg4 rcx
-#define arg5 r8
-#define arg6 r9
-#define arg7 STACK_OFFSET+8(%rsp)
-#define arg8 STACK_OFFSET+16(%rsp)
-#define arg9 STACK_OFFSET+24(%rsp)
-#define arg10 STACK_OFFSET+32(%rsp)
-#define arg11 STACK_OFFSET+40(%rsp)
-#define keysize 2*15*16(%arg1)
-#endif
-
#define STATE1 %xmm0
#define STATE2 %xmm4
@@ -170,1587 +64,6 @@ ALL_F: .octa 0xffffffffffffffffffffffffffffffff
#define TKEYP T1
#endif
-.macro FUNC_SAVE
- push %r12
- push %r13
- push %r14
-#
-# states of %xmm registers %xmm6:%xmm15 not saved
-# all %xmm registers are clobbered
-#
-.endm
-
-
-.macro FUNC_RESTORE
- pop %r14
- pop %r13
- pop %r12
-.endm
-
-# Precompute hashkeys.
-# Input: Hash subkey.
-# Output: HashKeys stored in gcm_context_data. Only needs to be called
-# once per key.
-# clobbers r12, and tmp xmm registers.
-.macro PRECOMPUTE SUBKEY TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7
- mov \SUBKEY, %r12
- movdqu (%r12), \TMP3
- movdqa SHUF_MASK(%rip), \TMP2
- pshufb \TMP2, \TMP3
-
- # precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
-
- movdqa \TMP3, \TMP2
- psllq $1, \TMP3
- psrlq $63, \TMP2
- movdqa \TMP2, \TMP1
- pslldq $8, \TMP2
- psrldq $8, \TMP1
- por \TMP2, \TMP3
-
- # reduce HashKey<<1
-
- pshufd $0x24, \TMP1, \TMP2
- pcmpeqd TWOONE(%rip), \TMP2
- pand POLY(%rip), \TMP2
- pxor \TMP2, \TMP3
- movdqu \TMP3, HashKey(%arg2)
-
- movdqa \TMP3, \TMP5
- pshufd $78, \TMP3, \TMP1
- pxor \TMP3, \TMP1
- movdqu \TMP1, HashKey_k(%arg2)
-
- GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
-# TMP5 = HashKey^2<<1 (mod poly)
- movdqu \TMP5, HashKey_2(%arg2)
-# HashKey_2 = HashKey^2<<1 (mod poly)
- pshufd $78, \TMP5, \TMP1
- pxor \TMP5, \TMP1
- movdqu \TMP1, HashKey_2_k(%arg2)
-
- GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
-# TMP5 = HashKey^3<<1 (mod poly)
- movdqu \TMP5, HashKey_3(%arg2)
- pshufd $78, \TMP5, \TMP1
- pxor \TMP5, \TMP1
- movdqu \TMP1, HashKey_3_k(%arg2)
-
- GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
-# TMP5 = HashKey^3<<1 (mod poly)
- movdqu \TMP5, HashKey_4(%arg2)
- pshufd $78, \TMP5, \TMP1
- pxor \TMP5, \TMP1
- movdqu \TMP1, HashKey_4_k(%arg2)
-.endm
-
-# GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding.
-# Clobbers rax, r10-r13 and xmm0-xmm6, %xmm13
-.macro GCM_INIT Iv SUBKEY AAD AADLEN
- mov \AADLEN, %r11
- mov %r11, AadLen(%arg2) # ctx_data.aad_length = aad_length
- xor %r11d, %r11d
- mov %r11, InLen(%arg2) # ctx_data.in_length = 0
- mov %r11, PBlockLen(%arg2) # ctx_data.partial_block_length = 0
- mov %r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0
- mov \Iv, %rax
- movdqu (%rax), %xmm0
- movdqu %xmm0, OrigIV(%arg2) # ctx_data.orig_IV = iv
-
- movdqa SHUF_MASK(%rip), %xmm2
- pshufb %xmm2, %xmm0
- movdqu %xmm0, CurCount(%arg2) # ctx_data.current_counter = iv
-
- PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7
- movdqu HashKey(%arg2), %xmm13
-
- CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \
- %xmm4, %xmm5, %xmm6
-.endm
-
-# GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context
-# struct has been initialized by GCM_INIT.
-# Requires the input data be at least 1 byte long because of READ_PARTIAL_BLOCK
-# Clobbers rax, r10-r13, and xmm0-xmm15
-.macro GCM_ENC_DEC operation
- movdqu AadHash(%arg2), %xmm8
- movdqu HashKey(%arg2), %xmm13
- add %arg5, InLen(%arg2)
-
- xor %r11d, %r11d # initialise the data pointer offset as zero
- PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation
-
- sub %r11, %arg5 # sub partial block data used
- mov %arg5, %r13 # save the number of bytes
-
- and $-16, %r13 # %r13 = %r13 - (%r13 mod 16)
- mov %r13, %r12
- # Encrypt/Decrypt first few blocks
-
- and $(3<<4), %r12
- jz .L_initial_num_blocks_is_0_\@
- cmp $(2<<4), %r12
- jb .L_initial_num_blocks_is_1_\@
- je .L_initial_num_blocks_is_2_\@
-.L_initial_num_blocks_is_3_\@:
- INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
-%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation
- sub $48, %r13
- jmp .L_initial_blocks_\@
-.L_initial_num_blocks_is_2_\@:
- INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
-%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation
- sub $32, %r13
- jmp .L_initial_blocks_\@
-.L_initial_num_blocks_is_1_\@:
- INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
-%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation
- sub $16, %r13
- jmp .L_initial_blocks_\@
-.L_initial_num_blocks_is_0_\@:
- INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
-%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation
-.L_initial_blocks_\@:
-
- # Main loop - Encrypt/Decrypt remaining blocks
-
- test %r13, %r13
- je .L_zero_cipher_left_\@
- sub $64, %r13
- je .L_four_cipher_left_\@
-.L_crypt_by_4_\@:
- GHASH_4_ENCRYPT_4_PARALLEL_\operation %xmm9, %xmm10, %xmm11, %xmm12, \
- %xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \
- %xmm7, %xmm8, enc
- add $64, %r11
- sub $64, %r13
- jne .L_crypt_by_4_\@
-.L_four_cipher_left_\@:
- GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
-%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
-.L_zero_cipher_left_\@:
- movdqu %xmm8, AadHash(%arg2)
- movdqu %xmm0, CurCount(%arg2)
-
- mov %arg5, %r13
- and $15, %r13 # %r13 = arg5 (mod 16)
- je .L_multiple_of_16_bytes_\@
-
- mov %r13, PBlockLen(%arg2)
-
- # Handle the last <16 Byte block separately
- paddd ONE(%rip), %xmm0 # INCR CNT to get Yn
- movdqu %xmm0, CurCount(%arg2)
- movdqa SHUF_MASK(%rip), %xmm10
- pshufb %xmm10, %xmm0
-
- ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn)
- movdqu %xmm0, PBlockEncKey(%arg2)
-
- cmp $16, %arg5
- jge .L_large_enough_update_\@
-
- lea (%arg4,%r11,1), %r10
- mov %r13, %r12
- READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
- jmp .L_data_read_\@
-
-.L_large_enough_update_\@:
- sub $16, %r11
- add %r13, %r11
-
- # receive the last <16 Byte block
- movdqu (%arg4, %r11, 1), %xmm1
-
- sub %r13, %r11
- add $16, %r11
-
- lea SHIFT_MASK+16(%rip), %r12
- # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
- # (r13 is the number of bytes in plaintext mod 16)
- sub %r13, %r12
- # get the appropriate shuffle mask
- movdqu (%r12), %xmm2
- # shift right 16-r13 bytes
- pshufb %xmm2, %xmm1
-
-.L_data_read_\@:
- lea ALL_F+16(%rip), %r12
- sub %r13, %r12
-
-.ifc \operation, dec
- movdqa %xmm1, %xmm2
-.endif
- pxor %xmm1, %xmm0 # XOR Encrypt(K, Yn)
- movdqu (%r12), %xmm1
- # get the appropriate mask to mask out top 16-r13 bytes of xmm0
- pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0
-.ifc \operation, dec
- pand %xmm1, %xmm2
- movdqa SHUF_MASK(%rip), %xmm10
- pshufb %xmm10 ,%xmm2
-
- pxor %xmm2, %xmm8
-.else
- movdqa SHUF_MASK(%rip), %xmm10
- pshufb %xmm10,%xmm0
-
- pxor %xmm0, %xmm8
-.endif
-
- movdqu %xmm8, AadHash(%arg2)
-.ifc \operation, enc
- # GHASH computation for the last <16 byte block
- movdqa SHUF_MASK(%rip), %xmm10
- # shuffle xmm0 back to output as ciphertext
- pshufb %xmm10, %xmm0
-.endif
-
- # Output %r13 bytes
- movq %xmm0, %rax
- cmp $8, %r13
- jle .L_less_than_8_bytes_left_\@
- mov %rax, (%arg3 , %r11, 1)
- add $8, %r11
- psrldq $8, %xmm0
- movq %xmm0, %rax
- sub $8, %r13
-.L_less_than_8_bytes_left_\@:
- mov %al, (%arg3, %r11, 1)
- add $1, %r11
- shr $8, %rax
- sub $1, %r13
- jne .L_less_than_8_bytes_left_\@
-.L_multiple_of_16_bytes_\@:
-.endm
-
-# GCM_COMPLETE Finishes update of tag of last partial block
-# Output: Authorization Tag (AUTH_TAG)
-# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
-.macro GCM_COMPLETE AUTHTAG AUTHTAGLEN
- movdqu AadHash(%arg2), %xmm8
- movdqu HashKey(%arg2), %xmm13
-
- mov PBlockLen(%arg2), %r12
-
- test %r12, %r12
- je .L_partial_done\@
-
- GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
-
-.L_partial_done\@:
- mov AadLen(%arg2), %r12 # %r13 = aadLen (number of bytes)
- shl $3, %r12 # convert into number of bits
- movd %r12d, %xmm15 # len(A) in %xmm15
- mov InLen(%arg2), %r12
- shl $3, %r12 # len(C) in bits (*128)
- movq %r12, %xmm1
-
- pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
- pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
- pxor %xmm15, %xmm8
- GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
- # final GHASH computation
- movdqa SHUF_MASK(%rip), %xmm10
- pshufb %xmm10, %xmm8
-
- movdqu OrigIV(%arg2), %xmm0 # %xmm0 = Y0
- ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0)
- pxor %xmm8, %xmm0
-.L_return_T_\@:
- mov \AUTHTAG, %r10 # %r10 = authTag
- mov \AUTHTAGLEN, %r11 # %r11 = auth_tag_len
- cmp $16, %r11
- je .L_T_16_\@
- cmp $8, %r11
- jl .L_T_4_\@
-.L_T_8_\@:
- movq %xmm0, %rax
- mov %rax, (%r10)
- add $8, %r10
- sub $8, %r11
- psrldq $8, %xmm0
- test %r11, %r11
- je .L_return_T_done_\@
-.L_T_4_\@:
- movd %xmm0, %eax
- mov %eax, (%r10)
- add $4, %r10
- sub $4, %r11
- psrldq $4, %xmm0
- test %r11, %r11
- je .L_return_T_done_\@
-.L_T_123_\@:
- movd %xmm0, %eax
- cmp $2, %r11
- jl .L_T_1_\@
- mov %ax, (%r10)
- cmp $2, %r11
- je .L_return_T_done_\@
- add $2, %r10
- sar $16, %eax
-.L_T_1_\@:
- mov %al, (%r10)
- jmp .L_return_T_done_\@
-.L_T_16_\@:
- movdqu %xmm0, (%r10)
-.L_return_T_done_\@:
-.endm
-
-#ifdef __x86_64__
-/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
-*
-*
-* Input: A and B (128-bits each, bit-reflected)
-* Output: C = A*B*x mod poly, (i.e. >>1 )
-* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
-* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
-*
-*/
-.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
- movdqa \GH, \TMP1
- pshufd $78, \GH, \TMP2
- pshufd $78, \HK, \TMP3
- pxor \GH, \TMP2 # TMP2 = a1+a0
- pxor \HK, \TMP3 # TMP3 = b1+b0
- pclmulqdq $0x11, \HK, \TMP1 # TMP1 = a1*b1
- pclmulqdq $0x00, \HK, \GH # GH = a0*b0
- pclmulqdq $0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0)
- pxor \GH, \TMP2
- pxor \TMP1, \TMP2 # TMP2 = (a0*b0)+(a1*b0)
- movdqa \TMP2, \TMP3
- pslldq $8, \TMP3 # left shift TMP3 2 DWs
- psrldq $8, \TMP2 # right shift TMP2 2 DWs
- pxor \TMP3, \GH
- pxor \TMP2, \TMP1 # TMP2:GH holds the result of GH*HK
-
- # first phase of the reduction
-
- movdqa \GH, \TMP2
- movdqa \GH, \TMP3
- movdqa \GH, \TMP4 # copy GH into TMP2,TMP3 and TMP4
- # in in order to perform
- # independent shifts
- pslld $31, \TMP2 # packed right shift <<31
- pslld $30, \TMP3 # packed right shift <<30
- pslld $25, \TMP4 # packed right shift <<25
- pxor \TMP3, \TMP2 # xor the shifted versions
- pxor \TMP4, \TMP2
- movdqa \TMP2, \TMP5
- psrldq $4, \TMP5 # right shift TMP5 1 DW
- pslldq $12, \TMP2 # left shift TMP2 3 DWs
- pxor \TMP2, \GH
-
- # second phase of the reduction
-
- movdqa \GH,\TMP2 # copy GH into TMP2,TMP3 and TMP4
- # in in order to perform
- # independent shifts
- movdqa \GH,\TMP3
- movdqa \GH,\TMP4
- psrld $1,\TMP2 # packed left shift >>1
- psrld $2,\TMP3 # packed left shift >>2
- psrld $7,\TMP4 # packed left shift >>7
- pxor \TMP3,\TMP2 # xor the shifted versions
- pxor \TMP4,\TMP2
- pxor \TMP5, \TMP2
- pxor \TMP2, \GH
- pxor \TMP1, \GH # result is in TMP1
-.endm
-
-# Reads DLEN bytes starting at DPTR and stores in XMMDst
-# where 0 < DLEN < 16
-# Clobbers %rax, DLEN and XMM1
-.macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
- cmp $8, \DLEN
- jl .L_read_lt8_\@
- mov (\DPTR), %rax
- movq %rax, \XMMDst
- sub $8, \DLEN
- jz .L_done_read_partial_block_\@
- xor %eax, %eax
-.L_read_next_byte_\@:
- shl $8, %rax
- mov 7(\DPTR, \DLEN, 1), %al
- dec \DLEN
- jnz .L_read_next_byte_\@
- movq %rax, \XMM1
- pslldq $8, \XMM1
- por \XMM1, \XMMDst
- jmp .L_done_read_partial_block_\@
-.L_read_lt8_\@:
- xor %eax, %eax
-.L_read_next_byte_lt8_\@:
- shl $8, %rax
- mov -1(\DPTR, \DLEN, 1), %al
- dec \DLEN
- jnz .L_read_next_byte_lt8_\@
- movq %rax, \XMMDst
-.L_done_read_partial_block_\@:
-.endm
-
-# CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
-# clobbers r10-11, xmm14
-.macro CALC_AAD_HASH HASHKEY AAD AADLEN TMP1 TMP2 TMP3 TMP4 TMP5 \
- TMP6 TMP7
- MOVADQ SHUF_MASK(%rip), %xmm14
- mov \AAD, %r10 # %r10 = AAD
- mov \AADLEN, %r11 # %r11 = aadLen
- pxor \TMP7, \TMP7
- pxor \TMP6, \TMP6
-
- cmp $16, %r11
- jl .L_get_AAD_rest\@
-.L_get_AAD_blocks\@:
- movdqu (%r10), \TMP7
- pshufb %xmm14, \TMP7 # byte-reflect the AAD data
- pxor \TMP7, \TMP6
- GHASH_MUL \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
- add $16, %r10
- sub $16, %r11
- cmp $16, %r11
- jge .L_get_AAD_blocks\@
-
- movdqu \TMP6, \TMP7
-
- /* read the last <16B of AAD */
-.L_get_AAD_rest\@:
- test %r11, %r11
- je .L_get_AAD_done\@
-
- READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7
- pshufb %xmm14, \TMP7 # byte-reflect the AAD data
- pxor \TMP6, \TMP7
- GHASH_MUL \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
- movdqu \TMP7, \TMP6
-
-.L_get_AAD_done\@:
- movdqu \TMP6, AadHash(%arg2)
-.endm
-
-# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
-# between update calls.
-# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
-# Outputs encrypted bytes, and updates hash and partial info in gcm_data_context
-# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
-.macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
- AAD_HASH operation
- mov PBlockLen(%arg2), %r13
- test %r13, %r13
- je .L_partial_block_done_\@ # Leave Macro if no partial blocks
- # Read in input data without over reading
- cmp $16, \PLAIN_CYPH_LEN
- jl .L_fewer_than_16_bytes_\@
- movups (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm
- jmp .L_data_read_\@
-
-.L_fewer_than_16_bytes_\@:
- lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
- mov \PLAIN_CYPH_LEN, %r12
- READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1
-
- mov PBlockLen(%arg2), %r13
-
-.L_data_read_\@: # Finished reading in data
-
- movdqu PBlockEncKey(%arg2), %xmm9
- movdqu HashKey(%arg2), %xmm13
-
- lea SHIFT_MASK(%rip), %r12
-
- # adjust the shuffle mask pointer to be able to shift r13 bytes
- # r16-r13 is the number of bytes in plaintext mod 16)
- add %r13, %r12
- movdqu (%r12), %xmm2 # get the appropriate shuffle mask
- pshufb %xmm2, %xmm9 # shift right r13 bytes
-
-.ifc \operation, dec
- movdqa %xmm1, %xmm3
- pxor %xmm1, %xmm9 # Ciphertext XOR E(K, Yn)
-
- mov \PLAIN_CYPH_LEN, %r10
- add %r13, %r10
- # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
- sub $16, %r10
- # Determine if partial block is not being filled and
- # shift mask accordingly
- jge .L_no_extra_mask_1_\@
- sub %r10, %r12
-.L_no_extra_mask_1_\@:
-
- movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
- # get the appropriate mask to mask out bottom r13 bytes of xmm9
- pand %xmm1, %xmm9 # mask out bottom r13 bytes of xmm9
-
- pand %xmm1, %xmm3
- movdqa SHUF_MASK(%rip), %xmm10
- pshufb %xmm10, %xmm3
- pshufb %xmm2, %xmm3
- pxor %xmm3, \AAD_HASH
-
- test %r10, %r10
- jl .L_partial_incomplete_1_\@
-
- # GHASH computation for the last <16 Byte block
- GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
- xor %eax, %eax
-
- mov %rax, PBlockLen(%arg2)
- jmp .L_dec_done_\@
-.L_partial_incomplete_1_\@:
- add \PLAIN_CYPH_LEN, PBlockLen(%arg2)
-.L_dec_done_\@:
- movdqu \AAD_HASH, AadHash(%arg2)
-.else
- pxor %xmm1, %xmm9 # Plaintext XOR E(K, Yn)
-
- mov \PLAIN_CYPH_LEN, %r10
- add %r13, %r10
- # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
- sub $16, %r10
- # Determine if partial block is not being filled and
- # shift mask accordingly
- jge .L_no_extra_mask_2_\@
- sub %r10, %r12
-.L_no_extra_mask_2_\@:
-
- movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
- # get the appropriate mask to mask out bottom r13 bytes of xmm9
- pand %xmm1, %xmm9
-
- movdqa SHUF_MASK(%rip), %xmm1
- pshufb %xmm1, %xmm9
- pshufb %xmm2, %xmm9
- pxor %xmm9, \AAD_HASH
-
- test %r10, %r10
- jl .L_partial_incomplete_2_\@
-
- # GHASH computation for the last <16 Byte block
- GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
- xor %eax, %eax
-
- mov %rax, PBlockLen(%arg2)
- jmp .L_encode_done_\@
-.L_partial_incomplete_2_\@:
- add \PLAIN_CYPH_LEN, PBlockLen(%arg2)
-.L_encode_done_\@:
- movdqu \AAD_HASH, AadHash(%arg2)
-
- movdqa SHUF_MASK(%rip), %xmm10
- # shuffle xmm9 back to output as ciphertext
- pshufb %xmm10, %xmm9
- pshufb %xmm2, %xmm9
-.endif
- # output encrypted Bytes
- test %r10, %r10
- jl .L_partial_fill_\@
- mov %r13, %r12
- mov $16, %r13
- # Set r13 to be the number of bytes to write out
- sub %r12, %r13
- jmp .L_count_set_\@
-.L_partial_fill_\@:
- mov \PLAIN_CYPH_LEN, %r13
-.L_count_set_\@:
- movdqa %xmm9, %xmm0
- movq %xmm0, %rax
- cmp $8, %r13
- jle .L_less_than_8_bytes_left_\@
-
- mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
- add $8, \DATA_OFFSET
- psrldq $8, %xmm0
- movq %xmm0, %rax
- sub $8, %r13
-.L_less_than_8_bytes_left_\@:
- movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
- add $1, \DATA_OFFSET
- shr $8, %rax
- sub $1, %r13
- jne .L_less_than_8_bytes_left_\@
-.L_partial_block_done_\@:
-.endm # PARTIAL_BLOCK
-
-/*
-* if a = number of total plaintext bytes
-* b = floor(a/16)
-* num_initial_blocks = b mod 4
-* encrypt the initial num_initial_blocks blocks and apply ghash on
-* the ciphertext
-* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
-* are clobbered
-* arg1, %arg2, %arg3 are used as a pointer only, not modified
-*/
-
-
-.macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
- XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
- MOVADQ SHUF_MASK(%rip), %xmm14
-
- movdqu AadHash(%arg2), %xmm\i # XMM0 = Y0
-
- # start AES for num_initial_blocks blocks
-
- movdqu CurCount(%arg2), \XMM0 # XMM0 = Y0
-
-.if (\i == 5) || (\i == 6) || (\i == 7)
-
- MOVADQ ONE(%RIP),\TMP1
- MOVADQ 0(%arg1),\TMP2
-.irpc index, \i_seq
- paddd \TMP1, \XMM0 # INCR Y0
-.ifc \operation, dec
- movdqa \XMM0, %xmm\index
-.else
- MOVADQ \XMM0, %xmm\index
-.endif
- pshufb %xmm14, %xmm\index # perform a 16 byte swap
- pxor \TMP2, %xmm\index
-.endr
- lea 0x10(%arg1),%r10
- mov keysize,%eax
- shr $2,%eax # 128->4, 192->6, 256->8
- add $5,%eax # 128->9, 192->11, 256->13
-
-.Laes_loop_initial_\@:
- MOVADQ (%r10),\TMP1
-.irpc index, \i_seq
- aesenc \TMP1, %xmm\index
-.endr
- add $16,%r10
- sub $1,%eax
- jnz .Laes_loop_initial_\@
-
- MOVADQ (%r10), \TMP1
-.irpc index, \i_seq
- aesenclast \TMP1, %xmm\index # Last Round
-.endr
-.irpc index, \i_seq
- movdqu (%arg4 , %r11, 1), \TMP1
- pxor \TMP1, %xmm\index
- movdqu %xmm\index, (%arg3 , %r11, 1)
- # write back plaintext/ciphertext for num_initial_blocks
- add $16, %r11
-
-.ifc \operation, dec
- movdqa \TMP1, %xmm\index
-.endif
- pshufb %xmm14, %xmm\index
-
- # prepare plaintext/ciphertext for GHASH computation
-.endr
-.endif
-
- # apply GHASH on num_initial_blocks blocks
-
-.if \i == 5
- pxor %xmm5, %xmm6
- GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
- pxor %xmm6, %xmm7
- GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
- pxor %xmm7, %xmm8
- GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
-.elseif \i == 6
- pxor %xmm6, %xmm7
- GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
- pxor %xmm7, %xmm8
- GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
-.elseif \i == 7
- pxor %xmm7, %xmm8
- GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
-.endif
- cmp $64, %r13
- jl .L_initial_blocks_done\@
- # no need for precomputed values
-/*
-*
-* Precomputations for HashKey parallel with encryption of first 4 blocks.
-* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
-*/
- MOVADQ ONE(%RIP),\TMP1
- paddd \TMP1, \XMM0 # INCR Y0
- MOVADQ \XMM0, \XMM1
- pshufb %xmm14, \XMM1 # perform a 16 byte swap
-
- paddd \TMP1, \XMM0 # INCR Y0
- MOVADQ \XMM0, \XMM2
- pshufb %xmm14, \XMM2 # perform a 16 byte swap
-
- paddd \TMP1, \XMM0 # INCR Y0
- MOVADQ \XMM0, \XMM3
- pshufb %xmm14, \XMM3 # perform a 16 byte swap
-
- paddd \TMP1, \XMM0 # INCR Y0
- MOVADQ \XMM0, \XMM4
- pshufb %xmm14, \XMM4 # perform a 16 byte swap
-
- MOVADQ 0(%arg1),\TMP1
- pxor \TMP1, \XMM1
- pxor \TMP1, \XMM2
- pxor \TMP1, \XMM3
- pxor \TMP1, \XMM4
-.irpc index, 1234 # do 4 rounds
- movaps 0x10*\index(%arg1), \TMP1
- aesenc \TMP1, \XMM1
- aesenc \TMP1, \XMM2
- aesenc \TMP1, \XMM3
- aesenc \TMP1, \XMM4
-.endr
-.irpc index, 56789 # do next 5 rounds
- movaps 0x10*\index(%arg1), \TMP1
- aesenc \TMP1, \XMM1
- aesenc \TMP1, \XMM2
- aesenc \TMP1, \XMM3
- aesenc \TMP1, \XMM4
-.endr
- lea 0xa0(%arg1),%r10
- mov keysize,%eax
- shr $2,%eax # 128->4, 192->6, 256->8
- sub $4,%eax # 128->0, 192->2, 256->4
- jz .Laes_loop_pre_done\@
-
-.Laes_loop_pre_\@:
- MOVADQ (%r10),\TMP2
-.irpc index, 1234
- aesenc \TMP2, %xmm\index
-.endr
- add $16,%r10
- sub $1,%eax
- jnz .Laes_loop_pre_\@
-
-.Laes_loop_pre_done\@:
- MOVADQ (%r10), \TMP2
- aesenclast \TMP2, \XMM1
- aesenclast \TMP2, \XMM2
- aesenclast \TMP2, \XMM3
- aesenclast \TMP2, \XMM4
- movdqu 16*0(%arg4 , %r11 , 1), \TMP1
- pxor \TMP1, \XMM1
-.ifc \operation, dec
- movdqu \XMM1, 16*0(%arg3 , %r11 , 1)
- movdqa \TMP1, \XMM1
-.endif
- movdqu 16*1(%arg4 , %r11 , 1), \TMP1
- pxor \TMP1, \XMM2
-.ifc \operation, dec
- movdqu \XMM2, 16*1(%arg3 , %r11 , 1)
- movdqa \TMP1, \XMM2
-.endif
- movdqu 16*2(%arg4 , %r11 , 1), \TMP1
- pxor \TMP1, \XMM3
-.ifc \operation, dec
- movdqu \XMM3, 16*2(%arg3 , %r11 , 1)
- movdqa \TMP1, \XMM3
-.endif
- movdqu 16*3(%arg4 , %r11 , 1), \TMP1
- pxor \TMP1, \XMM4
-.ifc \operation, dec
- movdqu \XMM4, 16*3(%arg3 , %r11 , 1)
- movdqa \TMP1, \XMM4
-.else
- movdqu \XMM1, 16*0(%arg3 , %r11 , 1)
- movdqu \XMM2, 16*1(%arg3 , %r11 , 1)
- movdqu \XMM3, 16*2(%arg3 , %r11 , 1)
- movdqu \XMM4, 16*3(%arg3 , %r11 , 1)
-.endif
-
- add $64, %r11
- pshufb %xmm14, \XMM1 # perform a 16 byte swap
- pxor \XMMDst, \XMM1
-# combine GHASHed value with the corresponding ciphertext
- pshufb %xmm14, \XMM2 # perform a 16 byte swap
- pshufb %xmm14, \XMM3 # perform a 16 byte swap
- pshufb %xmm14, \XMM4 # perform a 16 byte swap
-
-.L_initial_blocks_done\@:
-
-.endm
-
-/*
-* encrypt 4 blocks at a time
-* ghash the 4 previously encrypted ciphertext blocks
-* arg1, %arg3, %arg4 are used as pointers only, not modified
-* %r11 is the data offset value
-*/
-.macro GHASH_4_ENCRYPT_4_PARALLEL_enc TMP1 TMP2 TMP3 TMP4 TMP5 \
-TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
-
- movdqa \XMM1, \XMM5
- movdqa \XMM2, \XMM6
- movdqa \XMM3, \XMM7
- movdqa \XMM4, \XMM8
-
- movdqa SHUF_MASK(%rip), %xmm15
- # multiply TMP5 * HashKey using karatsuba
-
- movdqa \XMM5, \TMP4
- pshufd $78, \XMM5, \TMP6
- pxor \XMM5, \TMP6
- paddd ONE(%rip), \XMM0 # INCR CNT
- movdqu HashKey_4(%arg2), \TMP5
- pclmulqdq $0x11, \TMP5, \TMP4 # TMP4 = a1*b1
- movdqa \XMM0, \XMM1
- paddd ONE(%rip), \XMM0 # INCR CNT
- movdqa \XMM0, \XMM2
- paddd ONE(%rip), \XMM0 # INCR CNT
- movdqa \XMM0, \XMM3
- paddd ONE(%rip), \XMM0 # INCR CNT
- movdqa \XMM0, \XMM4
- pshufb %xmm15, \XMM1 # perform a 16 byte swap
- pclmulqdq $0x00, \TMP5, \XMM5 # XMM5 = a0*b0
- pshufb %xmm15, \XMM2 # perform a 16 byte swap
- pshufb %xmm15, \XMM3 # perform a 16 byte swap
- pshufb %xmm15, \XMM4 # perform a 16 byte swap
-
- pxor (%arg1), \XMM1
- pxor (%arg1), \XMM2
- pxor (%arg1), \XMM3
- pxor (%arg1), \XMM4
- movdqu HashKey_4_k(%arg2), \TMP5
- pclmulqdq $0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
- movaps 0x10(%arg1), \TMP1
- aesenc \TMP1, \XMM1 # Round 1
- aesenc \TMP1, \XMM2
- aesenc \TMP1, \XMM3
- aesenc \TMP1, \XMM4
- movaps 0x20(%arg1), \TMP1
- aesenc \TMP1, \XMM1 # Round 2
- aesenc \TMP1, \XMM2
- aesenc \TMP1, \XMM3
- aesenc \TMP1, \XMM4
- movdqa \XMM6, \TMP1
- pshufd $78, \XMM6, \TMP2
- pxor \XMM6, \TMP2
- movdqu HashKey_3(%arg2), \TMP5
- pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
- movaps 0x30(%arg1), \TMP3
- aesenc \TMP3, \XMM1 # Round 3
- aesenc \TMP3, \XMM2
- aesenc \TMP3, \XMM3
- aesenc \TMP3, \XMM4
- pclmulqdq $0x00, \TMP5, \XMM6 # XMM6 = a0*b0
- movaps 0x40(%arg1), \TMP3
- aesenc \TMP3, \XMM1 # Round 4
- aesenc \TMP3, \XMM2
- aesenc \TMP3, \XMM3
- aesenc \TMP3, \XMM4
- movdqu HashKey_3_k(%arg2), \TMP5
- pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
- movaps 0x50(%arg1), \TMP3
- aesenc \TMP3, \XMM1 # Round 5
- aesenc \TMP3, \XMM2
- aesenc \TMP3, \XMM3
- aesenc \TMP3, \XMM4
- pxor \TMP1, \TMP4
-# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
- pxor \XMM6, \XMM5
- pxor \TMP2, \TMP6
- movdqa \XMM7, \TMP1
- pshufd $78, \XMM7, \TMP2
- pxor \XMM7, \TMP2
- movdqu HashKey_2(%arg2), \TMP5
-
- # Multiply TMP5 * HashKey using karatsuba
-
- pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
- movaps 0x60(%arg1), \TMP3
- aesenc \TMP3, \XMM1 # Round 6
- aesenc \TMP3, \XMM2
- aesenc \TMP3, \XMM3
- aesenc \TMP3, \XMM4
- pclmulqdq $0x00, \TMP5, \XMM7 # XMM7 = a0*b0
- movaps 0x70(%arg1), \TMP3
- aesenc \TMP3, \XMM1 # Round 7
- aesenc \TMP3, \XMM2
- aesenc \TMP3, \XMM3
- aesenc \TMP3, \XMM4
- movdqu HashKey_2_k(%arg2), \TMP5
- pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
- movaps 0x80(%arg1), \TMP3
- aesenc \TMP3, \XMM1 # Round 8
- aesenc \TMP3, \XMM2
- aesenc \TMP3, \XMM3
- aesenc \TMP3, \XMM4
- pxor \TMP1, \TMP4
-# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
- pxor \XMM7, \XMM5
- pxor \TMP2, \TMP6
-
- # Multiply XMM8 * HashKey
- # XMM8 and TMP5 hold the values for the two operands
-
- movdqa \XMM8, \TMP1
- pshufd $78, \XMM8, \TMP2
- pxor \XMM8, \TMP2
- movdqu HashKey(%arg2), \TMP5
- pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
- movaps 0x90(%arg1), \TMP3
- aesenc \TMP3, \XMM1 # Round 9
- aesenc \TMP3, \XMM2
- aesenc \TMP3, \XMM3
- aesenc \TMP3, \XMM4
- pclmulqdq $0x00, \TMP5, \XMM8 # XMM8 = a0*b0
- lea 0xa0(%arg1),%r10
- mov keysize,%eax
- shr $2,%eax # 128->4, 192->6, 256->8
- sub $4,%eax # 128->0, 192->2, 256->4
- jz .Laes_loop_par_enc_done\@
-
-.Laes_loop_par_enc\@:
- MOVADQ (%r10),\TMP3
-.irpc index, 1234
- aesenc \TMP3, %xmm\index
-.endr
- add $16,%r10
- sub $1,%eax
- jnz .Laes_loop_par_enc\@
-
-.Laes_loop_par_enc_done\@:
- MOVADQ (%r10), \TMP3
- aesenclast \TMP3, \XMM1 # Round 10
- aesenclast \TMP3, \XMM2
- aesenclast \TMP3, \XMM3
- aesenclast \TMP3, \XMM4
- movdqu HashKey_k(%arg2), \TMP5
- pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
- movdqu (%arg4,%r11,1), \TMP3
- pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
- movdqu 16(%arg4,%r11,1), \TMP3
- pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
- movdqu 32(%arg4,%r11,1), \TMP3
- pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
- movdqu 48(%arg4,%r11,1), \TMP3
- pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
- movdqu \XMM1, (%arg3,%r11,1) # Write to the ciphertext buffer
- movdqu \XMM2, 16(%arg3,%r11,1) # Write to the ciphertext buffer
- movdqu \XMM3, 32(%arg3,%r11,1) # Write to the ciphertext buffer
- movdqu \XMM4, 48(%arg3,%r11,1) # Write to the ciphertext buffer
- pshufb %xmm15, \XMM1 # perform a 16 byte swap
- pshufb %xmm15, \XMM2 # perform a 16 byte swap
- pshufb %xmm15, \XMM3 # perform a 16 byte swap
- pshufb %xmm15, \XMM4 # perform a 16 byte swap
-
- pxor \TMP4, \TMP1
- pxor \XMM8, \XMM5
- pxor \TMP6, \TMP2
- pxor \TMP1, \TMP2
- pxor \XMM5, \TMP2
- movdqa \TMP2, \TMP3
- pslldq $8, \TMP3 # left shift TMP3 2 DWs
- psrldq $8, \TMP2 # right shift TMP2 2 DWs
- pxor \TMP3, \XMM5
- pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
-
- # first phase of reduction
-
- movdqa \XMM5, \TMP2
- movdqa \XMM5, \TMP3
- movdqa \XMM5, \TMP4
-# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
- pslld $31, \TMP2 # packed right shift << 31
- pslld $30, \TMP3 # packed right shift << 30
- pslld $25, \TMP4 # packed right shift << 25
- pxor \TMP3, \TMP2 # xor the shifted versions
- pxor \TMP4, \TMP2
- movdqa \TMP2, \TMP5
- psrldq $4, \TMP5 # right shift T5 1 DW
- pslldq $12, \TMP2 # left shift T2 3 DWs
- pxor \TMP2, \XMM5
-
- # second phase of reduction
-
- movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
- movdqa \XMM5,\TMP3
- movdqa \XMM5,\TMP4
- psrld $1, \TMP2 # packed left shift >>1
- psrld $2, \TMP3 # packed left shift >>2
- psrld $7, \TMP4 # packed left shift >>7
- pxor \TMP3,\TMP2 # xor the shifted versions
- pxor \TMP4,\TMP2
- pxor \TMP5, \TMP2
- pxor \TMP2, \XMM5
- pxor \TMP1, \XMM5 # result is in TMP1
-
- pxor \XMM5, \XMM1
-.endm
-
-/*
-* decrypt 4 blocks at a time
-* ghash the 4 previously decrypted ciphertext blocks
-* arg1, %arg3, %arg4 are used as pointers only, not modified
-* %r11 is the data offset value
-*/
-.macro GHASH_4_ENCRYPT_4_PARALLEL_dec TMP1 TMP2 TMP3 TMP4 TMP5 \
-TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
-
- movdqa \XMM1, \XMM5
- movdqa \XMM2, \XMM6
- movdqa \XMM3, \XMM7
- movdqa \XMM4, \XMM8
-
- movdqa SHUF_MASK(%rip), %xmm15
- # multiply TMP5 * HashKey using karatsuba
-
- movdqa \XMM5, \TMP4
- pshufd $78, \XMM5, \TMP6
- pxor \XMM5, \TMP6
- paddd ONE(%rip), \XMM0 # INCR CNT
- movdqu HashKey_4(%arg2), \TMP5
- pclmulqdq $0x11, \TMP5, \TMP4 # TMP4 = a1*b1
- movdqa \XMM0, \XMM1
- paddd ONE(%rip), \XMM0 # INCR CNT
- movdqa \XMM0, \XMM2
- paddd ONE(%rip), \XMM0 # INCR CNT
- movdqa \XMM0, \XMM3
- paddd ONE(%rip), \XMM0 # INCR CNT
- movdqa \XMM0, \XMM4
- pshufb %xmm15, \XMM1 # perform a 16 byte swap
- pclmulqdq $0x00, \TMP5, \XMM5 # XMM5 = a0*b0
- pshufb %xmm15, \XMM2 # perform a 16 byte swap
- pshufb %xmm15, \XMM3 # perform a 16 byte swap
- pshufb %xmm15, \XMM4 # perform a 16 byte swap
-
- pxor (%arg1), \XMM1
- pxor (%arg1), \XMM2
- pxor (%arg1), \XMM3
- pxor (%arg1), \XMM4
- movdqu HashKey_4_k(%arg2), \TMP5
- pclmulqdq $0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
- movaps 0x10(%arg1), \TMP1
- aesenc \TMP1, \XMM1 # Round 1
- aesenc \TMP1, \XMM2
- aesenc \TMP1, \XMM3
- aesenc \TMP1, \XMM4
- movaps 0x20(%arg1), \TMP1
- aesenc \TMP1, \XMM1 # Round 2
- aesenc \TMP1, \XMM2
- aesenc \TMP1, \XMM3
- aesenc \TMP1, \XMM4
- movdqa \XMM6, \TMP1
- pshufd $78, \XMM6, \TMP2
- pxor \XMM6, \TMP2
- movdqu HashKey_3(%arg2), \TMP5
- pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
- movaps 0x30(%arg1), \TMP3
- aesenc \TMP3, \XMM1 # Round 3
- aesenc \TMP3, \XMM2
- aesenc \TMP3, \XMM3
- aesenc \TMP3, \XMM4
- pclmulqdq $0x00, \TMP5, \XMM6 # XMM6 = a0*b0
- movaps 0x40(%arg1), \TMP3
- aesenc \TMP3, \XMM1 # Round 4
- aesenc \TMP3, \XMM2
- aesenc \TMP3, \XMM3
- aesenc \TMP3, \XMM4
- movdqu HashKey_3_k(%arg2), \TMP5
- pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
- movaps 0x50(%arg1), \TMP3
- aesenc \TMP3, \XMM1 # Round 5
- aesenc \TMP3, \XMM2
- aesenc \TMP3, \XMM3
- aesenc \TMP3, \XMM4
- pxor \TMP1, \TMP4
-# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
- pxor \XMM6, \XMM5
- pxor \TMP2, \TMP6
- movdqa \XMM7, \TMP1
- pshufd $78, \XMM7, \TMP2
- pxor \XMM7, \TMP2
- movdqu HashKey_2(%arg2), \TMP5
-
- # Multiply TMP5 * HashKey using karatsuba
-
- pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
- movaps 0x60(%arg1), \TMP3
- aesenc \TMP3, \XMM1 # Round 6
- aesenc \TMP3, \XMM2
- aesenc \TMP3, \XMM3
- aesenc \TMP3, \XMM4
- pclmulqdq $0x00, \TMP5, \XMM7 # XMM7 = a0*b0
- movaps 0x70(%arg1), \TMP3
- aesenc \TMP3, \XMM1 # Round 7
- aesenc \TMP3, \XMM2
- aesenc \TMP3, \XMM3
- aesenc \TMP3, \XMM4
- movdqu HashKey_2_k(%arg2), \TMP5
- pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
- movaps 0x80(%arg1), \TMP3
- aesenc \TMP3, \XMM1 # Round 8
- aesenc \TMP3, \XMM2
- aesenc \TMP3, \XMM3
- aesenc \TMP3, \XMM4
- pxor \TMP1, \TMP4
-# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
- pxor \XMM7, \XMM5
- pxor \TMP2, \TMP6
-
- # Multiply XMM8 * HashKey
- # XMM8 and TMP5 hold the values for the two operands
-
- movdqa \XMM8, \TMP1
- pshufd $78, \XMM8, \TMP2
- pxor \XMM8, \TMP2
- movdqu HashKey(%arg2), \TMP5
- pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
- movaps 0x90(%arg1), \TMP3
- aesenc \TMP3, \XMM1 # Round 9
- aesenc \TMP3, \XMM2
- aesenc \TMP3, \XMM3
- aesenc \TMP3, \XMM4
- pclmulqdq $0x00, \TMP5, \XMM8 # XMM8 = a0*b0
- lea 0xa0(%arg1),%r10
- mov keysize,%eax
- shr $2,%eax # 128->4, 192->6, 256->8
- sub $4,%eax # 128->0, 192->2, 256->4
- jz .Laes_loop_par_dec_done\@
-
-.Laes_loop_par_dec\@:
- MOVADQ (%r10),\TMP3
-.irpc index, 1234
- aesenc \TMP3, %xmm\index
-.endr
- add $16,%r10
- sub $1,%eax
- jnz .Laes_loop_par_dec\@
-
-.Laes_loop_par_dec_done\@:
- MOVADQ (%r10), \TMP3
- aesenclast \TMP3, \XMM1 # last round
- aesenclast \TMP3, \XMM2
- aesenclast \TMP3, \XMM3
- aesenclast \TMP3, \XMM4
- movdqu HashKey_k(%arg2), \TMP5
- pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
- movdqu (%arg4,%r11,1), \TMP3
- pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
- movdqu \XMM1, (%arg3,%r11,1) # Write to plaintext buffer
- movdqa \TMP3, \XMM1
- movdqu 16(%arg4,%r11,1), \TMP3
- pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
- movdqu \XMM2, 16(%arg3,%r11,1) # Write to plaintext buffer
- movdqa \TMP3, \XMM2
- movdqu 32(%arg4,%r11,1), \TMP3
- pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
- movdqu \XMM3, 32(%arg3,%r11,1) # Write to plaintext buffer
- movdqa \TMP3, \XMM3
- movdqu 48(%arg4,%r11,1), \TMP3
- pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
- movdqu \XMM4, 48(%arg3,%r11,1) # Write to plaintext buffer
- movdqa \TMP3, \XMM4
- pshufb %xmm15, \XMM1 # perform a 16 byte swap
- pshufb %xmm15, \XMM2 # perform a 16 byte swap
- pshufb %xmm15, \XMM3 # perform a 16 byte swap
- pshufb %xmm15, \XMM4 # perform a 16 byte swap
-
- pxor \TMP4, \TMP1
- pxor \XMM8, \XMM5
- pxor \TMP6, \TMP2
- pxor \TMP1, \TMP2
- pxor \XMM5, \TMP2
- movdqa \TMP2, \TMP3
- pslldq $8, \TMP3 # left shift TMP3 2 DWs
- psrldq $8, \TMP2 # right shift TMP2 2 DWs
- pxor \TMP3, \XMM5
- pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
-
- # first phase of reduction
-
- movdqa \XMM5, \TMP2
- movdqa \XMM5, \TMP3
- movdqa \XMM5, \TMP4
-# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
- pslld $31, \TMP2 # packed right shift << 31
- pslld $30, \TMP3 # packed right shift << 30
- pslld $25, \TMP4 # packed right shift << 25
- pxor \TMP3, \TMP2 # xor the shifted versions
- pxor \TMP4, \TMP2
- movdqa \TMP2, \TMP5
- psrldq $4, \TMP5 # right shift T5 1 DW
- pslldq $12, \TMP2 # left shift T2 3 DWs
- pxor \TMP2, \XMM5
-
- # second phase of reduction
-
- movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
- movdqa \XMM5,\TMP3
- movdqa \XMM5,\TMP4
- psrld $1, \TMP2 # packed left shift >>1
- psrld $2, \TMP3 # packed left shift >>2
- psrld $7, \TMP4 # packed left shift >>7
- pxor \TMP3,\TMP2 # xor the shifted versions
- pxor \TMP4,\TMP2
- pxor \TMP5, \TMP2
- pxor \TMP2, \XMM5
- pxor \TMP1, \XMM5 # result is in TMP1
-
- pxor \XMM5, \XMM1
-.endm
-
-/* GHASH the last 4 ciphertext blocks. */
-.macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
-TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
-
- # Multiply TMP6 * HashKey (using Karatsuba)
-
- movdqa \XMM1, \TMP6
- pshufd $78, \XMM1, \TMP2
- pxor \XMM1, \TMP2
- movdqu HashKey_4(%arg2), \TMP5
- pclmulqdq $0x11, \TMP5, \TMP6 # TMP6 = a1*b1
- pclmulqdq $0x00, \TMP5, \XMM1 # XMM1 = a0*b0
- movdqu HashKey_4_k(%arg2), \TMP4
- pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
- movdqa \XMM1, \XMMDst
- movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1
-
- # Multiply TMP1 * HashKey (using Karatsuba)
-
- movdqa \XMM2, \TMP1
- pshufd $78, \XMM2, \TMP2
- pxor \XMM2, \TMP2
- movdqu HashKey_3(%arg2), \TMP5
- pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
- pclmulqdq $0x00, \TMP5, \XMM2 # XMM2 = a0*b0
- movdqu HashKey_3_k(%arg2), \TMP4
- pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
- pxor \TMP1, \TMP6
- pxor \XMM2, \XMMDst
- pxor \TMP2, \XMM1
-# results accumulated in TMP6, XMMDst, XMM1
-
- # Multiply TMP1 * HashKey (using Karatsuba)
-
- movdqa \XMM3, \TMP1
- pshufd $78, \XMM3, \TMP2
- pxor \XMM3, \TMP2
- movdqu HashKey_2(%arg2), \TMP5
- pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
- pclmulqdq $0x00, \TMP5, \XMM3 # XMM3 = a0*b0
- movdqu HashKey_2_k(%arg2), \TMP4
- pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
- pxor \TMP1, \TMP6
- pxor \XMM3, \XMMDst
- pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1
-
- # Multiply TMP1 * HashKey (using Karatsuba)
- movdqa \XMM4, \TMP1
- pshufd $78, \XMM4, \TMP2
- pxor \XMM4, \TMP2
- movdqu HashKey(%arg2), \TMP5
- pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
- pclmulqdq $0x00, \TMP5, \XMM4 # XMM4 = a0*b0
- movdqu HashKey_k(%arg2), \TMP4
- pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
- pxor \TMP1, \TMP6
- pxor \XMM4, \XMMDst
- pxor \XMM1, \TMP2
- pxor \TMP6, \TMP2
- pxor \XMMDst, \TMP2
- # middle section of the temp results combined as in karatsuba algorithm
- movdqa \TMP2, \TMP4
- pslldq $8, \TMP4 # left shift TMP4 2 DWs
- psrldq $8, \TMP2 # right shift TMP2 2 DWs
- pxor \TMP4, \XMMDst
- pxor \TMP2, \TMP6
-# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
- # first phase of the reduction
- movdqa \XMMDst, \TMP2
- movdqa \XMMDst, \TMP3
- movdqa \XMMDst, \TMP4
-# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
- pslld $31, \TMP2 # packed right shifting << 31
- pslld $30, \TMP3 # packed right shifting << 30
- pslld $25, \TMP4 # packed right shifting << 25
- pxor \TMP3, \TMP2 # xor the shifted versions
- pxor \TMP4, \TMP2
- movdqa \TMP2, \TMP7
- psrldq $4, \TMP7 # right shift TMP7 1 DW
- pslldq $12, \TMP2 # left shift TMP2 3 DWs
- pxor \TMP2, \XMMDst
-
- # second phase of the reduction
- movdqa \XMMDst, \TMP2
- # make 3 copies of XMMDst for doing 3 shift operations
- movdqa \XMMDst, \TMP3
- movdqa \XMMDst, \TMP4
- psrld $1, \TMP2 # packed left shift >> 1
- psrld $2, \TMP3 # packed left shift >> 2
- psrld $7, \TMP4 # packed left shift >> 7
- pxor \TMP3, \TMP2 # xor the shifted versions
- pxor \TMP4, \TMP2
- pxor \TMP7, \TMP2
- pxor \TMP2, \XMMDst
- pxor \TMP6, \XMMDst # reduced result is in XMMDst
-.endm
-
-
-/* Encryption of a single block
-* uses eax & r10
-*/
-
-.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
-
- pxor (%arg1), \XMM0
- mov keysize,%eax
- shr $2,%eax # 128->4, 192->6, 256->8
- add $5,%eax # 128->9, 192->11, 256->13
- lea 16(%arg1), %r10 # get first expanded key address
-
-_esb_loop_\@:
- MOVADQ (%r10),\TMP1
- aesenc \TMP1,\XMM0
- add $16,%r10
- sub $1,%eax
- jnz _esb_loop_\@
-
- MOVADQ (%r10),\TMP1
- aesenclast \TMP1,\XMM0
-.endm
-/*****************************************************************************
-* void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
-* struct gcm_context_data *data
-* // Context data
-* u8 *out, // Plaintext output. Encrypt in-place is allowed.
-* const u8 *in, // Ciphertext input
-* u64 plaintext_len, // Length of data in bytes for decryption.
-* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
-* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
-* // concatenated with 0x00000001. 16-byte aligned pointer.
-* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
-* const u8 *aad, // Additional Authentication Data (AAD)
-* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
-* u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the
-* // given authentication tag and only return the plaintext if they match.
-* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
-* // (most likely), 12 or 8.
-*
-* Assumptions:
-*
-* keys:
-* keys are pre-expanded and aligned to 16 bytes. we are using the first
-* set of 11 keys in the data structure void *aes_ctx
-*
-* iv:
-* 0 1 2 3
-* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* | Salt (From the SA) |
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* | Initialization Vector |
-* | (This is the sequence number from IPSec header) |
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* | 0x1 |
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-*
-*
-*
-* AAD:
-* AAD padded to 128 bits with 0
-* for example, assume AAD is a u32 vector
-*
-* if AAD is 8 bytes:
-* AAD[3] = {A0, A1};
-* padded AAD in xmm register = {A1 A0 0 0}
-*
-* 0 1 2 3
-* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* | SPI (A1) |
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* | 32-bit Sequence Number (A0) |
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* | 0x0 |
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-*
-* AAD Format with 32-bit Sequence Number
-*
-* if AAD is 12 bytes:
-* AAD[3] = {A0, A1, A2};
-* padded AAD in xmm register = {A2 A1 A0 0}
-*
-* 0 1 2 3
-* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* | SPI (A2) |
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* | 64-bit Extended Sequence Number {A1,A0} |
-* | |
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* | 0x0 |
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-*
-* AAD Format with 64-bit Extended Sequence Number
-*
-* poly = x^128 + x^127 + x^126 + x^121 + 1
-*
-*****************************************************************************/
-SYM_FUNC_START(aesni_gcm_dec)
- FUNC_SAVE
-
- GCM_INIT %arg6, arg7, arg8, arg9
- GCM_ENC_DEC dec
- GCM_COMPLETE arg10, arg11
- FUNC_RESTORE
- RET
-SYM_FUNC_END(aesni_gcm_dec)
-
-
-/*****************************************************************************
-* void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
-* struct gcm_context_data *data
-* // Context data
-* u8 *out, // Ciphertext output. Encrypt in-place is allowed.
-* const u8 *in, // Plaintext input
-* u64 plaintext_len, // Length of data in bytes for encryption.
-* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
-* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
-* // concatenated with 0x00000001. 16-byte aligned pointer.
-* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
-* const u8 *aad, // Additional Authentication Data (AAD)
-* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
-* u8 *auth_tag, // Authenticated Tag output.
-* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
-* // 12 or 8.
-*
-* Assumptions:
-*
-* keys:
-* keys are pre-expanded and aligned to 16 bytes. we are using the
-* first set of 11 keys in the data structure void *aes_ctx
-*
-*
-* iv:
-* 0 1 2 3
-* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* | Salt (From the SA) |
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* | Initialization Vector |
-* | (This is the sequence number from IPSec header) |
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* | 0x1 |
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-*
-*
-*
-* AAD:
-* AAD padded to 128 bits with 0
-* for example, assume AAD is a u32 vector
-*
-* if AAD is 8 bytes:
-* AAD[3] = {A0, A1};
-* padded AAD in xmm register = {A1 A0 0 0}
-*
-* 0 1 2 3
-* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* | SPI (A1) |
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* | 32-bit Sequence Number (A0) |
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* | 0x0 |
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-*
-* AAD Format with 32-bit Sequence Number
-*
-* if AAD is 12 bytes:
-* AAD[3] = {A0, A1, A2};
-* padded AAD in xmm register = {A2 A1 A0 0}
-*
-* 0 1 2 3
-* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* | SPI (A2) |
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* | 64-bit Extended Sequence Number {A1,A0} |
-* | |
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* | 0x0 |
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-*
-* AAD Format with 64-bit Extended Sequence Number
-*
-* poly = x^128 + x^127 + x^126 + x^121 + 1
-***************************************************************************/
-SYM_FUNC_START(aesni_gcm_enc)
- FUNC_SAVE
-
- GCM_INIT %arg6, arg7, arg8, arg9
- GCM_ENC_DEC enc
-
- GCM_COMPLETE arg10, arg11
- FUNC_RESTORE
- RET
-SYM_FUNC_END(aesni_gcm_enc)
-
-/*****************************************************************************
-* void aesni_gcm_init(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
-* struct gcm_context_data *data,
-* // context data
-* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
-* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
-* // concatenated with 0x00000001. 16-byte aligned pointer.
-* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
-* const u8 *aad, // Additional Authentication Data (AAD)
-* u64 aad_len) // Length of AAD in bytes.
-*/
-SYM_FUNC_START(aesni_gcm_init)
- FUNC_SAVE
- GCM_INIT %arg3, %arg4,%arg5, %arg6
- FUNC_RESTORE
- RET
-SYM_FUNC_END(aesni_gcm_init)
-
-/*****************************************************************************
-* void aesni_gcm_enc_update(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
-* struct gcm_context_data *data,
-* // context data
-* u8 *out, // Ciphertext output. Encrypt in-place is allowed.
-* const u8 *in, // Plaintext input
-* u64 plaintext_len, // Length of data in bytes for encryption.
-*/
-SYM_FUNC_START(aesni_gcm_enc_update)
- FUNC_SAVE
- GCM_ENC_DEC enc
- FUNC_RESTORE
- RET
-SYM_FUNC_END(aesni_gcm_enc_update)
-
-/*****************************************************************************
-* void aesni_gcm_dec_update(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
-* struct gcm_context_data *data,
-* // context data
-* u8 *out, // Ciphertext output. Encrypt in-place is allowed.
-* const u8 *in, // Plaintext input
-* u64 plaintext_len, // Length of data in bytes for encryption.
-*/
-SYM_FUNC_START(aesni_gcm_dec_update)
- FUNC_SAVE
- GCM_ENC_DEC dec
- FUNC_RESTORE
- RET
-SYM_FUNC_END(aesni_gcm_dec_update)
-
-/*****************************************************************************
-* void aesni_gcm_finalize(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
-* struct gcm_context_data *data,
-* // context data
-* u8 *auth_tag, // Authenticated Tag output.
-* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
-* // 12 or 8.
-*/
-SYM_FUNC_START(aesni_gcm_finalize)
- FUNC_SAVE
- GCM_COMPLETE %arg3 %arg4
- FUNC_RESTORE
- RET
-SYM_FUNC_END(aesni_gcm_finalize)
-
-#endif
-
SYM_FUNC_START_LOCAL(_key_expansion_256a)
pshufd $0b11111111, %xmm1, %xmm1
shufps $0b00010000, %xmm0, %xmm4
@@ -1820,8 +133,8 @@ SYM_FUNC_START_LOCAL(_key_expansion_256b)
SYM_FUNC_END(_key_expansion_256b)
/*
- * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
- * unsigned int key_len)
+ * void aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
+ * unsigned int key_len)
*/
SYM_FUNC_START(aesni_set_key)
FRAME_BEGIN
@@ -1926,7 +239,6 @@ SYM_FUNC_START(aesni_set_key)
sub $0x10, UKEYP
cmp TKEYP, KEYP
jb .Ldec_key_loop
- xor AREG, AREG
#ifndef __x86_64__
popl KEYP
#endif
@@ -2826,28 +1138,24 @@ SYM_FUNC_END(aesni_ctr_enc)
.previous
/*
- * _aesni_gf128mul_x_ble: internal ABI
- * Multiply in GF(2^128) for XTS IVs
+ * _aesni_gf128mul_x_ble: Multiply in GF(2^128) for XTS IVs
* input:
* IV: current IV
* GF128MUL_MASK == mask with 0x87 and 0x01
* output:
* IV: next IV
* changed:
- * CTR: == temporary value
+ * KEY: == temporary value
*/
-#define _aesni_gf128mul_x_ble() \
- pshufd $0x13, IV, KEY; \
- paddq IV, IV; \
- psrad $31, KEY; \
- pand GF128MUL_MASK, KEY; \
- pxor KEY, IV;
+.macro _aesni_gf128mul_x_ble
+ pshufd $0x13, IV, KEY
+ paddq IV, IV
+ psrad $31, KEY
+ pand GF128MUL_MASK, KEY
+ pxor KEY, IV
+.endm
-/*
- * void aesni_xts_encrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
- * const u8 *src, unsigned int len, le128 *iv)
- */
-SYM_FUNC_START(aesni_xts_encrypt)
+.macro _aesni_xts_crypt enc
FRAME_BEGIN
#ifndef __x86_64__
pushl IVP
@@ -2866,35 +1174,46 @@ SYM_FUNC_START(aesni_xts_encrypt)
movups (IVP), IV
mov 480(KEYP), KLEN
+.if !\enc
+ add $240, KEYP
+
+ test $15, LEN
+ jz .Lxts_loop4\@
+ sub $16, LEN
+.endif
-.Lxts_enc_loop4:
+.Lxts_loop4\@:
sub $64, LEN
- jl .Lxts_enc_1x
+ jl .Lxts_1x\@
movdqa IV, STATE1
movdqu 0x00(INP), IN
pxor IN, STATE1
movdqu IV, 0x00(OUTP)
- _aesni_gf128mul_x_ble()
+ _aesni_gf128mul_x_ble
movdqa IV, STATE2
movdqu 0x10(INP), IN
pxor IN, STATE2
movdqu IV, 0x10(OUTP)
- _aesni_gf128mul_x_ble()
+ _aesni_gf128mul_x_ble
movdqa IV, STATE3
movdqu 0x20(INP), IN
pxor IN, STATE3
movdqu IV, 0x20(OUTP)
- _aesni_gf128mul_x_ble()
+ _aesni_gf128mul_x_ble
movdqa IV, STATE4
movdqu 0x30(INP), IN
pxor IN, STATE4
movdqu IV, 0x30(OUTP)
+.if \enc
call _aesni_enc4
+.else
+ call _aesni_dec4
+.endif
movdqu 0x00(OUTP), IN
pxor IN, STATE1
@@ -2912,17 +1231,17 @@ SYM_FUNC_START(aesni_xts_encrypt)
pxor IN, STATE4
movdqu STATE4, 0x30(OUTP)
- _aesni_gf128mul_x_ble()
+ _aesni_gf128mul_x_ble
add $64, INP
add $64, OUTP
test LEN, LEN
- jnz .Lxts_enc_loop4
+ jnz .Lxts_loop4\@
-.Lxts_enc_ret_iv:
+.Lxts_ret_iv\@:
movups IV, (IVP)
-.Lxts_enc_ret:
+.Lxts_ret\@:
#ifndef __x86_64__
popl KLEN
popl KEYP
@@ -2932,201 +1251,60 @@ SYM_FUNC_START(aesni_xts_encrypt)
FRAME_END
RET
-.Lxts_enc_1x:
+.Lxts_1x\@:
add $64, LEN
- jz .Lxts_enc_ret_iv
+ jz .Lxts_ret_iv\@
+.if \enc
sub $16, LEN
- jl .Lxts_enc_cts4
+ jl .Lxts_cts4\@
+.endif
-.Lxts_enc_loop1:
+.Lxts_loop1\@:
movdqu (INP), STATE
+.if \enc
pxor IV, STATE
call _aesni_enc1
- pxor IV, STATE
- _aesni_gf128mul_x_ble()
-
- test LEN, LEN
- jz .Lxts_enc_out
-
+.else
add $16, INP
sub $16, LEN
- jl .Lxts_enc_cts1
-
- movdqu STATE, (OUTP)
- add $16, OUTP
- jmp .Lxts_enc_loop1
-
-.Lxts_enc_out:
- movdqu STATE, (OUTP)
- jmp .Lxts_enc_ret_iv
-
-.Lxts_enc_cts4:
- movdqa STATE4, STATE
- sub $16, OUTP
-
-.Lxts_enc_cts1:
-#ifndef __x86_64__
- lea .Lcts_permute_table, T1
-#else
- lea .Lcts_permute_table(%rip), T1
-#endif
- add LEN, INP /* rewind input pointer */
- add $16, LEN /* # bytes in final block */
- movups (INP), IN1
-
- mov T1, IVP
- add $32, IVP
- add LEN, T1
- sub LEN, IVP
- add OUTP, LEN
-
- movups (T1), %xmm4
- movaps STATE, IN2
- pshufb %xmm4, STATE
- movups STATE, (LEN)
-
- movups (IVP), %xmm0
- pshufb %xmm0, IN1
- pblendvb IN2, IN1
- movaps IN1, STATE
-
+ jl .Lxts_cts1\@
pxor IV, STATE
- call _aesni_enc1
+ call _aesni_dec1
+.endif
pxor IV, STATE
+ _aesni_gf128mul_x_ble
- movups STATE, (OUTP)
- jmp .Lxts_enc_ret
-SYM_FUNC_END(aesni_xts_encrypt)
-
-/*
- * void aesni_xts_decrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
- * const u8 *src, unsigned int len, le128 *iv)
- */
-SYM_FUNC_START(aesni_xts_decrypt)
- FRAME_BEGIN
-#ifndef __x86_64__
- pushl IVP
- pushl LEN
- pushl KEYP
- pushl KLEN
- movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
- movl (FRAME_OFFSET+24)(%esp), OUTP # dst
- movl (FRAME_OFFSET+28)(%esp), INP # src
- movl (FRAME_OFFSET+32)(%esp), LEN # len
- movl (FRAME_OFFSET+36)(%esp), IVP # iv
- movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
-#else
- movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
-#endif
- movups (IVP), IV
-
- mov 480(KEYP), KLEN
- add $240, KEYP
-
- test $15, LEN
- jz .Lxts_dec_loop4
- sub $16, LEN
-
-.Lxts_dec_loop4:
- sub $64, LEN
- jl .Lxts_dec_1x
-
- movdqa IV, STATE1
- movdqu 0x00(INP), IN
- pxor IN, STATE1
- movdqu IV, 0x00(OUTP)
-
- _aesni_gf128mul_x_ble()
- movdqa IV, STATE2
- movdqu 0x10(INP), IN
- pxor IN, STATE2
- movdqu IV, 0x10(OUTP)
-
- _aesni_gf128mul_x_ble()
- movdqa IV, STATE3
- movdqu 0x20(INP), IN
- pxor IN, STATE3
- movdqu IV, 0x20(OUTP)
-
- _aesni_gf128mul_x_ble()
- movdqa IV, STATE4
- movdqu 0x30(INP), IN
- pxor IN, STATE4
- movdqu IV, 0x30(OUTP)
-
- call _aesni_dec4
-
- movdqu 0x00(OUTP), IN
- pxor IN, STATE1
- movdqu STATE1, 0x00(OUTP)
-
- movdqu 0x10(OUTP), IN
- pxor IN, STATE2
- movdqu STATE2, 0x10(OUTP)
-
- movdqu 0x20(OUTP), IN
- pxor IN, STATE3
- movdqu STATE3, 0x20(OUTP)
-
- movdqu 0x30(OUTP), IN
- pxor IN, STATE4
- movdqu STATE4, 0x30(OUTP)
-
- _aesni_gf128mul_x_ble()
-
- add $64, INP
- add $64, OUTP
test LEN, LEN
- jnz .Lxts_dec_loop4
-
-.Lxts_dec_ret_iv:
- movups IV, (IVP)
-
-.Lxts_dec_ret:
-#ifndef __x86_64__
- popl KLEN
- popl KEYP
- popl LEN
- popl IVP
-#endif
- FRAME_END
- RET
-
-.Lxts_dec_1x:
- add $64, LEN
- jz .Lxts_dec_ret_iv
-
-.Lxts_dec_loop1:
- movdqu (INP), STATE
+ jz .Lxts_out\@
+.if \enc
add $16, INP
sub $16, LEN
- jl .Lxts_dec_cts1
-
- pxor IV, STATE
- call _aesni_dec1
- pxor IV, STATE
- _aesni_gf128mul_x_ble()
-
- test LEN, LEN
- jz .Lxts_dec_out
+ jl .Lxts_cts1\@
+.endif
movdqu STATE, (OUTP)
add $16, OUTP
- jmp .Lxts_dec_loop1
+ jmp .Lxts_loop1\@
-.Lxts_dec_out:
+.Lxts_out\@:
movdqu STATE, (OUTP)
- jmp .Lxts_dec_ret_iv
+ jmp .Lxts_ret_iv\@
-.Lxts_dec_cts1:
+.if \enc
+.Lxts_cts4\@:
+ movdqa STATE4, STATE
+ sub $16, OUTP
+.Lxts_cts1\@:
+.else
+.Lxts_cts1\@:
movdqa IV, STATE4
- _aesni_gf128mul_x_ble()
+ _aesni_gf128mul_x_ble
pxor IV, STATE
call _aesni_dec1
pxor IV, STATE
-
+.endif
#ifndef __x86_64__
lea .Lcts_permute_table, T1
#else
@@ -3152,10 +1330,32 @@ SYM_FUNC_START(aesni_xts_decrypt)
pblendvb IN2, IN1
movaps IN1, STATE
+.if \enc
+ pxor IV, STATE
+ call _aesni_enc1
+ pxor IV, STATE
+.else
pxor STATE4, STATE
call _aesni_dec1
pxor STATE4, STATE
+.endif
movups STATE, (OUTP)
- jmp .Lxts_dec_ret
-SYM_FUNC_END(aesni_xts_decrypt)
+ jmp .Lxts_ret\@
+.endm
+
+/*
+ * void aesni_xts_enc(const struct crypto_aes_ctx *ctx, u8 *dst,
+ * const u8 *src, unsigned int len, le128 *iv)
+ */
+SYM_FUNC_START(aesni_xts_enc)
+ _aesni_xts_crypt 1
+SYM_FUNC_END(aesni_xts_enc)
+
+/*
+ * void aesni_xts_dec(const struct crypto_aes_ctx *ctx, u8 *dst,
+ * const u8 *src, unsigned int len, le128 *iv)
+ */
+SYM_FUNC_START(aesni_xts_dec)
+ _aesni_xts_crypt 0
+SYM_FUNC_END(aesni_xts_dec)
diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S
deleted file mode 100644
index 8c9749ed0651..000000000000
--- a/arch/x86/crypto/aesni-intel_avx-x86_64.S
+++ /dev/null
@@ -1,2804 +0,0 @@
-########################################################################
-# Copyright (c) 2013, Intel Corporation
-#
-# This software is available to you under a choice of one of two
-# licenses. You may choose to be licensed under the terms of the GNU
-# General Public License (GPL) Version 2, available from the file
-# COPYING in the main directory of this source tree, or the
-# OpenIB.org BSD license below:
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are
-# met:
-#
-# * Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-#
-# * Redistributions in binary form must reproduce the above copyright
-# notice, this list of conditions and the following disclaimer in the
-# documentation and/or other materials provided with the
-# distribution.
-#
-# * Neither the name of the Intel Corporation nor the names of its
-# contributors may be used to endorse or promote products derived from
-# this software without specific prior written permission.
-#
-#
-# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
-# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
-# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES# LOSS OF USE, DATA, OR
-# PROFITS# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-########################################################################
-##
-## Authors:
-## Erdinc Ozturk <erdinc.ozturk@intel.com>
-## Vinodh Gopal <vinodh.gopal@intel.com>
-## James Guilford <james.guilford@intel.com>
-## Tim Chen <tim.c.chen@linux.intel.com>
-##
-## References:
-## This code was derived and highly optimized from the code described in paper:
-## Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation
-## on Intel Architecture Processors. August, 2010
-## The details of the implementation is explained in:
-## Erdinc Ozturk et. al. Enabling High-Performance Galois-Counter-Mode
-## on Intel Architecture Processors. October, 2012.
-##
-## Assumptions:
-##
-##
-##
-## iv:
-## 0 1 2 3
-## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
-## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-## | Salt (From the SA) |
-## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-## | Initialization Vector |
-## | (This is the sequence number from IPSec header) |
-## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-## | 0x1 |
-## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-##
-##
-##
-## AAD:
-## AAD padded to 128 bits with 0
-## for example, assume AAD is a u32 vector
-##
-## if AAD is 8 bytes:
-## AAD[3] = {A0, A1}#
-## padded AAD in xmm register = {A1 A0 0 0}
-##
-## 0 1 2 3
-## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
-## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-## | SPI (A1) |
-## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-## | 32-bit Sequence Number (A0) |
-## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-## | 0x0 |
-## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-##
-## AAD Format with 32-bit Sequence Number
-##
-## if AAD is 12 bytes:
-## AAD[3] = {A0, A1, A2}#
-## padded AAD in xmm register = {A2 A1 A0 0}
-##
-## 0 1 2 3
-## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
-## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-## | SPI (A2) |
-## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-## | 64-bit Extended Sequence Number {A1,A0} |
-## | |
-## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-## | 0x0 |
-## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-##
-## AAD Format with 64-bit Extended Sequence Number
-##
-##
-## aadLen:
-## from the definition of the spec, aadLen can only be 8 or 12 bytes.
-## The code additionally supports aadLen of length 16 bytes.
-##
-## TLen:
-## from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
-##
-## poly = x^128 + x^127 + x^126 + x^121 + 1
-## throughout the code, one tab and two tab indentations are used. one tab is
-## for GHASH part, two tabs is for AES part.
-##
-
-#include <linux/linkage.h>
-
-# constants in mergeable sections, linker can reorder and merge
-.section .rodata.cst16.POLY, "aM", @progbits, 16
-.align 16
-POLY: .octa 0xC2000000000000000000000000000001
-
-.section .rodata.cst16.POLY2, "aM", @progbits, 16
-.align 16
-POLY2: .octa 0xC20000000000000000000001C2000000
-
-.section .rodata.cst16.TWOONE, "aM", @progbits, 16
-.align 16
-TWOONE: .octa 0x00000001000000000000000000000001
-
-.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
-.align 16
-SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
-
-.section .rodata.cst16.ONE, "aM", @progbits, 16
-.align 16
-ONE: .octa 0x00000000000000000000000000000001
-
-.section .rodata.cst16.ONEf, "aM", @progbits, 16
-.align 16
-ONEf: .octa 0x01000000000000000000000000000000
-
-# order of these constants should not change.
-# more specifically, ALL_F should follow SHIFT_MASK, and zero should follow ALL_F
-.section .rodata, "a", @progbits
-.align 16
-SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
-ALL_F: .octa 0xffffffffffffffffffffffffffffffff
- .octa 0x00000000000000000000000000000000
-
-.text
-
-
-#define AadHash 16*0
-#define AadLen 16*1
-#define InLen (16*1)+8
-#define PBlockEncKey 16*2
-#define OrigIV 16*3
-#define CurCount 16*4
-#define PBlockLen 16*5
-
-HashKey = 16*6 # store HashKey <<1 mod poly here
-HashKey_2 = 16*7 # store HashKey^2 <<1 mod poly here
-HashKey_3 = 16*8 # store HashKey^3 <<1 mod poly here
-HashKey_4 = 16*9 # store HashKey^4 <<1 mod poly here
-HashKey_5 = 16*10 # store HashKey^5 <<1 mod poly here
-HashKey_6 = 16*11 # store HashKey^6 <<1 mod poly here
-HashKey_7 = 16*12 # store HashKey^7 <<1 mod poly here
-HashKey_8 = 16*13 # store HashKey^8 <<1 mod poly here
-HashKey_k = 16*14 # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes)
-HashKey_2_k = 16*15 # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
-HashKey_3_k = 16*16 # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
-HashKey_4_k = 16*17 # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
-HashKey_5_k = 16*18 # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
-HashKey_6_k = 16*19 # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
-HashKey_7_k = 16*20 # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
-HashKey_8_k = 16*21 # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
-
-#define arg1 %rdi
-#define arg2 %rsi
-#define arg3 %rdx
-#define arg4 %rcx
-#define arg5 %r8
-#define arg6 %r9
-#define keysize 2*15*16(arg1)
-
-i = 0
-j = 0
-
-out_order = 0
-in_order = 1
-DEC = 0
-ENC = 1
-
-.macro define_reg r n
-reg_\r = %xmm\n
-.endm
-
-.macro setreg
-.altmacro
-define_reg i %i
-define_reg j %j
-.noaltmacro
-.endm
-
-TMP1 = 16*0 # Temporary storage for AAD
-TMP2 = 16*1 # Temporary storage for AES State 2 (State 1 is stored in an XMM register)
-TMP3 = 16*2 # Temporary storage for AES State 3
-TMP4 = 16*3 # Temporary storage for AES State 4
-TMP5 = 16*4 # Temporary storage for AES State 5
-TMP6 = 16*5 # Temporary storage for AES State 6
-TMP7 = 16*6 # Temporary storage for AES State 7
-TMP8 = 16*7 # Temporary storage for AES State 8
-
-VARIABLE_OFFSET = 16*8
-
-################################
-# Utility Macros
-################################
-
-.macro FUNC_SAVE
- push %r12
- push %r13
- push %r15
-
- push %rbp
- mov %rsp, %rbp
-
- sub $VARIABLE_OFFSET, %rsp
- and $~63, %rsp # align rsp to 64 bytes
-.endm
-
-.macro FUNC_RESTORE
- mov %rbp, %rsp
- pop %rbp
-
- pop %r15
- pop %r13
- pop %r12
-.endm
-
-# Encryption of a single block
-.macro ENCRYPT_SINGLE_BLOCK REP XMM0
- vpxor (arg1), \XMM0, \XMM0
- i = 1
- setreg
-.rep \REP
- vaesenc 16*i(arg1), \XMM0, \XMM0
- i = (i+1)
- setreg
-.endr
- vaesenclast 16*i(arg1), \XMM0, \XMM0
-.endm
-
-# combined for GCM encrypt and decrypt functions
-# clobbering all xmm registers
-# clobbering r10, r11, r12, r13, r15, rax
-.macro GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC REP
- vmovdqu AadHash(arg2), %xmm8
- vmovdqu HashKey(arg2), %xmm13 # xmm13 = HashKey
- add arg5, InLen(arg2)
-
- # initialize the data pointer offset as zero
- xor %r11d, %r11d
-
- PARTIAL_BLOCK \GHASH_MUL, arg3, arg4, arg5, %r11, %xmm8, \ENC_DEC
- sub %r11, arg5
-
- mov arg5, %r13 # save the number of bytes of plaintext/ciphertext
- and $-16, %r13 # r13 = r13 - (r13 mod 16)
-
- mov %r13, %r12
- shr $4, %r12
- and $7, %r12
- jz .L_initial_num_blocks_is_0\@
-
- cmp $7, %r12
- je .L_initial_num_blocks_is_7\@
- cmp $6, %r12
- je .L_initial_num_blocks_is_6\@
- cmp $5, %r12
- je .L_initial_num_blocks_is_5\@
- cmp $4, %r12
- je .L_initial_num_blocks_is_4\@
- cmp $3, %r12
- je .L_initial_num_blocks_is_3\@
- cmp $2, %r12
- je .L_initial_num_blocks_is_2\@
-
- jmp .L_initial_num_blocks_is_1\@
-
-.L_initial_num_blocks_is_7\@:
- \INITIAL_BLOCKS \REP, 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
- sub $16*7, %r13
- jmp .L_initial_blocks_encrypted\@
-
-.L_initial_num_blocks_is_6\@:
- \INITIAL_BLOCKS \REP, 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
- sub $16*6, %r13
- jmp .L_initial_blocks_encrypted\@
-
-.L_initial_num_blocks_is_5\@:
- \INITIAL_BLOCKS \REP, 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
- sub $16*5, %r13
- jmp .L_initial_blocks_encrypted\@
-
-.L_initial_num_blocks_is_4\@:
- \INITIAL_BLOCKS \REP, 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
- sub $16*4, %r13
- jmp .L_initial_blocks_encrypted\@
-
-.L_initial_num_blocks_is_3\@:
- \INITIAL_BLOCKS \REP, 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
- sub $16*3, %r13
- jmp .L_initial_blocks_encrypted\@
-
-.L_initial_num_blocks_is_2\@:
- \INITIAL_BLOCKS \REP, 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
- sub $16*2, %r13
- jmp .L_initial_blocks_encrypted\@
-
-.L_initial_num_blocks_is_1\@:
- \INITIAL_BLOCKS \REP, 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
- sub $16*1, %r13
- jmp .L_initial_blocks_encrypted\@
-
-.L_initial_num_blocks_is_0\@:
- \INITIAL_BLOCKS \REP, 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
-
-
-.L_initial_blocks_encrypted\@:
- test %r13, %r13
- je .L_zero_cipher_left\@
-
- sub $128, %r13
- je .L_eight_cipher_left\@
-
-
-
-
- vmovd %xmm9, %r15d
- and $255, %r15d
- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
-
-
-.L_encrypt_by_8_new\@:
- cmp $(255-8), %r15d
- jg .L_encrypt_by_8\@
-
-
-
- add $8, %r15b
- \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
- add $128, %r11
- sub $128, %r13
- jne .L_encrypt_by_8_new\@
-
- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
- jmp .L_eight_cipher_left\@
-
-.L_encrypt_by_8\@:
- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
- add $8, %r15b
- \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
- add $128, %r11
- sub $128, %r13
- jne .L_encrypt_by_8_new\@
-
- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
-
-
-
-
-.L_eight_cipher_left\@:
- \GHASH_LAST_8 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
-
-
-.L_zero_cipher_left\@:
- vmovdqu %xmm14, AadHash(arg2)
- vmovdqu %xmm9, CurCount(arg2)
-
- # check for 0 length
- mov arg5, %r13
- and $15, %r13 # r13 = (arg5 mod 16)
-
- je .L_multiple_of_16_bytes\@
-
- # handle the last <16 Byte block separately
-
- mov %r13, PBlockLen(arg2)
-
- vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
- vmovdqu %xmm9, CurCount(arg2)
- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
-
- ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Yn)
- vmovdqu %xmm9, PBlockEncKey(arg2)
-
- cmp $16, arg5
- jge .L_large_enough_update\@
-
- lea (arg4,%r11,1), %r10
- mov %r13, %r12
-
- READ_PARTIAL_BLOCK %r10 %r12 %xmm1
-
- lea SHIFT_MASK+16(%rip), %r12
- sub %r13, %r12 # adjust the shuffle mask pointer to be
- # able to shift 16-r13 bytes (r13 is the
- # number of bytes in plaintext mod 16)
-
- jmp .L_final_ghash_mul\@
-
-.L_large_enough_update\@:
- sub $16, %r11
- add %r13, %r11
-
- # receive the last <16 Byte block
- vmovdqu (arg4, %r11, 1), %xmm1
-
- sub %r13, %r11
- add $16, %r11
-
- lea SHIFT_MASK+16(%rip), %r12
- # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
- # (r13 is the number of bytes in plaintext mod 16)
- sub %r13, %r12
- # get the appropriate shuffle mask
- vmovdqu (%r12), %xmm2
- # shift right 16-r13 bytes
- vpshufb %xmm2, %xmm1, %xmm1
-
-.L_final_ghash_mul\@:
- .if \ENC_DEC == DEC
- vmovdqa %xmm1, %xmm2
- vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
- vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
- # mask out top 16-r13 bytes of xmm9
- vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
- vpand %xmm1, %xmm2, %xmm2
- vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
- vpxor %xmm2, %xmm14, %xmm14
-
- vmovdqu %xmm14, AadHash(arg2)
- .else
- vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
- vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
- # mask out top 16-r13 bytes of xmm9
- vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
- vpxor %xmm9, %xmm14, %xmm14
-
- vmovdqu %xmm14, AadHash(arg2)
- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext
- .endif
-
-
- #############################
- # output r13 Bytes
- vmovq %xmm9, %rax
- cmp $8, %r13
- jle .L_less_than_8_bytes_left\@
-
- mov %rax, (arg3 , %r11)
- add $8, %r11
- vpsrldq $8, %xmm9, %xmm9
- vmovq %xmm9, %rax
- sub $8, %r13
-
-.L_less_than_8_bytes_left\@:
- movb %al, (arg3 , %r11)
- add $1, %r11
- shr $8, %rax
- sub $1, %r13
- jne .L_less_than_8_bytes_left\@
- #############################
-
-.L_multiple_of_16_bytes\@:
-.endm
-
-
-# GCM_COMPLETE Finishes update of tag of last partial block
-# Output: Authorization Tag (AUTH_TAG)
-# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
-.macro GCM_COMPLETE GHASH_MUL REP AUTH_TAG AUTH_TAG_LEN
- vmovdqu AadHash(arg2), %xmm14
- vmovdqu HashKey(arg2), %xmm13
-
- mov PBlockLen(arg2), %r12
- test %r12, %r12
- je .L_partial_done\@
-
- #GHASH computation for the last <16 Byte block
- \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
-
-.L_partial_done\@:
- mov AadLen(arg2), %r12 # r12 = aadLen (number of bytes)
- shl $3, %r12 # convert into number of bits
- vmovd %r12d, %xmm15 # len(A) in xmm15
-
- mov InLen(arg2), %r12
- shl $3, %r12 # len(C) in bits (*128)
- vmovq %r12, %xmm1
- vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000
- vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C)
-
- vpxor %xmm15, %xmm14, %xmm14
- \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation
- vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap
-
- vmovdqu OrigIV(arg2), %xmm9
-
- ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Y0)
-
- vpxor %xmm14, %xmm9, %xmm9
-
-
-
-.L_return_T\@:
- mov \AUTH_TAG, %r10 # r10 = authTag
- mov \AUTH_TAG_LEN, %r11 # r11 = auth_tag_len
-
- cmp $16, %r11
- je .L_T_16\@
-
- cmp $8, %r11
- jl .L_T_4\@
-
-.L_T_8\@:
- vmovq %xmm9, %rax
- mov %rax, (%r10)
- add $8, %r10
- sub $8, %r11
- vpsrldq $8, %xmm9, %xmm9
- test %r11, %r11
- je .L_return_T_done\@
-.L_T_4\@:
- vmovd %xmm9, %eax
- mov %eax, (%r10)
- add $4, %r10
- sub $4, %r11
- vpsrldq $4, %xmm9, %xmm9
- test %r11, %r11
- je .L_return_T_done\@
-.L_T_123\@:
- vmovd %xmm9, %eax
- cmp $2, %r11
- jl .L_T_1\@
- mov %ax, (%r10)
- cmp $2, %r11
- je .L_return_T_done\@
- add $2, %r10
- sar $16, %eax
-.L_T_1\@:
- mov %al, (%r10)
- jmp .L_return_T_done\@
-
-.L_T_16\@:
- vmovdqu %xmm9, (%r10)
-
-.L_return_T_done\@:
-.endm
-
-.macro CALC_AAD_HASH GHASH_MUL AAD AADLEN T1 T2 T3 T4 T5 T6 T7 T8
-
- mov \AAD, %r10 # r10 = AAD
- mov \AADLEN, %r12 # r12 = aadLen
-
-
- mov %r12, %r11
-
- vpxor \T8, \T8, \T8
- vpxor \T7, \T7, \T7
- cmp $16, %r11
- jl .L_get_AAD_rest8\@
-.L_get_AAD_blocks\@:
- vmovdqu (%r10), \T7
- vpshufb SHUF_MASK(%rip), \T7, \T7
- vpxor \T7, \T8, \T8
- \GHASH_MUL \T8, \T2, \T1, \T3, \T4, \T5, \T6
- add $16, %r10
- sub $16, %r12
- sub $16, %r11
- cmp $16, %r11
- jge .L_get_AAD_blocks\@
- vmovdqu \T8, \T7
- test %r11, %r11
- je .L_get_AAD_done\@
-
- vpxor \T7, \T7, \T7
-
- /* read the last <16B of AAD. since we have at least 4B of
- data right after the AAD (the ICV, and maybe some CT), we can
- read 4B/8B blocks safely, and then get rid of the extra stuff */
-.L_get_AAD_rest8\@:
- cmp $4, %r11
- jle .L_get_AAD_rest4\@
- movq (%r10), \T1
- add $8, %r10
- sub $8, %r11
- vpslldq $8, \T1, \T1
- vpsrldq $8, \T7, \T7
- vpxor \T1, \T7, \T7
- jmp .L_get_AAD_rest8\@
-.L_get_AAD_rest4\@:
- test %r11, %r11
- jle .L_get_AAD_rest0\@
- mov (%r10), %eax
- movq %rax, \T1
- add $4, %r10
- sub $4, %r11
- vpslldq $12, \T1, \T1
- vpsrldq $4, \T7, \T7
- vpxor \T1, \T7, \T7
-.L_get_AAD_rest0\@:
- /* finalize: shift out the extra bytes we read, and align
- left. since pslldq can only shift by an immediate, we use
- vpshufb and a pair of shuffle masks */
- leaq ALL_F(%rip), %r11
- subq %r12, %r11
- vmovdqu 16(%r11), \T1
- andq $~3, %r11
- vpshufb (%r11), \T7, \T7
- vpand \T1, \T7, \T7
-.L_get_AAD_rest_final\@:
- vpshufb SHUF_MASK(%rip), \T7, \T7
- vpxor \T8, \T7, \T7
- \GHASH_MUL \T7, \T2, \T1, \T3, \T4, \T5, \T6
-
-.L_get_AAD_done\@:
- vmovdqu \T7, AadHash(arg2)
-.endm
-
-.macro INIT GHASH_MUL PRECOMPUTE
- mov arg6, %r11
- mov %r11, AadLen(arg2) # ctx_data.aad_length = aad_length
- xor %r11d, %r11d
- mov %r11, InLen(arg2) # ctx_data.in_length = 0
-
- mov %r11, PBlockLen(arg2) # ctx_data.partial_block_length = 0
- mov %r11, PBlockEncKey(arg2) # ctx_data.partial_block_enc_key = 0
- mov arg3, %rax
- movdqu (%rax), %xmm0
- movdqu %xmm0, OrigIV(arg2) # ctx_data.orig_IV = iv
-
- vpshufb SHUF_MASK(%rip), %xmm0, %xmm0
- movdqu %xmm0, CurCount(arg2) # ctx_data.current_counter = iv
-
- vmovdqu (arg4), %xmm6 # xmm6 = HashKey
-
- vpshufb SHUF_MASK(%rip), %xmm6, %xmm6
- ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
- vmovdqa %xmm6, %xmm2
- vpsllq $1, %xmm6, %xmm6
- vpsrlq $63, %xmm2, %xmm2
- vmovdqa %xmm2, %xmm1
- vpslldq $8, %xmm2, %xmm2
- vpsrldq $8, %xmm1, %xmm1
- vpor %xmm2, %xmm6, %xmm6
- #reduction
- vpshufd $0b00100100, %xmm1, %xmm2
- vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
- vpand POLY(%rip), %xmm2, %xmm2
- vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly
- #######################################################################
- vmovdqu %xmm6, HashKey(arg2) # store HashKey<<1 mod poly
-
- CALC_AAD_HASH \GHASH_MUL, arg5, arg6, %xmm2, %xmm6, %xmm3, %xmm4, %xmm5, %xmm7, %xmm1, %xmm0
-
- \PRECOMPUTE %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
-.endm
-
-
-# Reads DLEN bytes starting at DPTR and stores in XMMDst
-# where 0 < DLEN < 16
-# Clobbers %rax, DLEN
-.macro READ_PARTIAL_BLOCK DPTR DLEN XMMDst
- vpxor \XMMDst, \XMMDst, \XMMDst
-
- cmp $8, \DLEN
- jl .L_read_lt8_\@
- mov (\DPTR), %rax
- vpinsrq $0, %rax, \XMMDst, \XMMDst
- sub $8, \DLEN
- jz .L_done_read_partial_block_\@
- xor %eax, %eax
-.L_read_next_byte_\@:
- shl $8, %rax
- mov 7(\DPTR, \DLEN, 1), %al
- dec \DLEN
- jnz .L_read_next_byte_\@
- vpinsrq $1, %rax, \XMMDst, \XMMDst
- jmp .L_done_read_partial_block_\@
-.L_read_lt8_\@:
- xor %eax, %eax
-.L_read_next_byte_lt8_\@:
- shl $8, %rax
- mov -1(\DPTR, \DLEN, 1), %al
- dec \DLEN
- jnz .L_read_next_byte_lt8_\@
- vpinsrq $0, %rax, \XMMDst, \XMMDst
-.L_done_read_partial_block_\@:
-.endm
-
-# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
-# between update calls.
-# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
-# Outputs encrypted bytes, and updates hash and partial info in gcm_data_context
-# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
-.macro PARTIAL_BLOCK GHASH_MUL CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
- AAD_HASH ENC_DEC
- mov PBlockLen(arg2), %r13
- test %r13, %r13
- je .L_partial_block_done_\@ # Leave Macro if no partial blocks
- # Read in input data without over reading
- cmp $16, \PLAIN_CYPH_LEN
- jl .L_fewer_than_16_bytes_\@
- vmovdqu (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm
- jmp .L_data_read_\@
-
-.L_fewer_than_16_bytes_\@:
- lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
- mov \PLAIN_CYPH_LEN, %r12
- READ_PARTIAL_BLOCK %r10 %r12 %xmm1
-
- mov PBlockLen(arg2), %r13
-
-.L_data_read_\@: # Finished reading in data
-
- vmovdqu PBlockEncKey(arg2), %xmm9
- vmovdqu HashKey(arg2), %xmm13
-
- lea SHIFT_MASK(%rip), %r12
-
- # adjust the shuffle mask pointer to be able to shift r13 bytes
- # r16-r13 is the number of bytes in plaintext mod 16)
- add %r13, %r12
- vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask
- vpshufb %xmm2, %xmm9, %xmm9 # shift right r13 bytes
-
-.if \ENC_DEC == DEC
- vmovdqa %xmm1, %xmm3
- pxor %xmm1, %xmm9 # Ciphertext XOR E(K, Yn)
-
- mov \PLAIN_CYPH_LEN, %r10
- add %r13, %r10
- # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
- sub $16, %r10
- # Determine if partial block is not being filled and
- # shift mask accordingly
- jge .L_no_extra_mask_1_\@
- sub %r10, %r12
-.L_no_extra_mask_1_\@:
-
- vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1
- # get the appropriate mask to mask out bottom r13 bytes of xmm9
- vpand %xmm1, %xmm9, %xmm9 # mask out bottom r13 bytes of xmm9
-
- vpand %xmm1, %xmm3, %xmm3
- vmovdqa SHUF_MASK(%rip), %xmm10
- vpshufb %xmm10, %xmm3, %xmm3
- vpshufb %xmm2, %xmm3, %xmm3
- vpxor %xmm3, \AAD_HASH, \AAD_HASH
-
- test %r10, %r10
- jl .L_partial_incomplete_1_\@
-
- # GHASH computation for the last <16 Byte block
- \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
- xor %eax,%eax
-
- mov %rax, PBlockLen(arg2)
- jmp .L_dec_done_\@
-.L_partial_incomplete_1_\@:
- add \PLAIN_CYPH_LEN, PBlockLen(arg2)
-.L_dec_done_\@:
- vmovdqu \AAD_HASH, AadHash(arg2)
-.else
- vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
-
- mov \PLAIN_CYPH_LEN, %r10
- add %r13, %r10
- # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
- sub $16, %r10
- # Determine if partial block is not being filled and
- # shift mask accordingly
- jge .L_no_extra_mask_2_\@
- sub %r10, %r12
-.L_no_extra_mask_2_\@:
-
- vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1
- # get the appropriate mask to mask out bottom r13 bytes of xmm9
- vpand %xmm1, %xmm9, %xmm9
-
- vmovdqa SHUF_MASK(%rip), %xmm1
- vpshufb %xmm1, %xmm9, %xmm9
- vpshufb %xmm2, %xmm9, %xmm9
- vpxor %xmm9, \AAD_HASH, \AAD_HASH
-
- test %r10, %r10
- jl .L_partial_incomplete_2_\@
-
- # GHASH computation for the last <16 Byte block
- \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
- xor %eax,%eax
-
- mov %rax, PBlockLen(arg2)
- jmp .L_encode_done_\@
-.L_partial_incomplete_2_\@:
- add \PLAIN_CYPH_LEN, PBlockLen(arg2)
-.L_encode_done_\@:
- vmovdqu \AAD_HASH, AadHash(arg2)
-
- vmovdqa SHUF_MASK(%rip), %xmm10
- # shuffle xmm9 back to output as ciphertext
- vpshufb %xmm10, %xmm9, %xmm9
- vpshufb %xmm2, %xmm9, %xmm9
-.endif
- # output encrypted Bytes
- test %r10, %r10
- jl .L_partial_fill_\@
- mov %r13, %r12
- mov $16, %r13
- # Set r13 to be the number of bytes to write out
- sub %r12, %r13
- jmp .L_count_set_\@
-.L_partial_fill_\@:
- mov \PLAIN_CYPH_LEN, %r13
-.L_count_set_\@:
- vmovdqa %xmm9, %xmm0
- vmovq %xmm0, %rax
- cmp $8, %r13
- jle .L_less_than_8_bytes_left_\@
-
- mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
- add $8, \DATA_OFFSET
- psrldq $8, %xmm0
- vmovq %xmm0, %rax
- sub $8, %r13
-.L_less_than_8_bytes_left_\@:
- movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
- add $1, \DATA_OFFSET
- shr $8, %rax
- sub $1, %r13
- jne .L_less_than_8_bytes_left_\@
-.L_partial_block_done_\@:
-.endm # PARTIAL_BLOCK
-
-###############################################################################
-# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
-# Input: A and B (128-bits each, bit-reflected)
-# Output: C = A*B*x mod poly, (i.e. >>1 )
-# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
-# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
-###############################################################################
-.macro GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5
-
- vpshufd $0b01001110, \GH, \T2
- vpshufd $0b01001110, \HK, \T3
- vpxor \GH , \T2, \T2 # T2 = (a1+a0)
- vpxor \HK , \T3, \T3 # T3 = (b1+b0)
-
- vpclmulqdq $0x11, \HK, \GH, \T1 # T1 = a1*b1
- vpclmulqdq $0x00, \HK, \GH, \GH # GH = a0*b0
- vpclmulqdq $0x00, \T3, \T2, \T2 # T2 = (a1+a0)*(b1+b0)
- vpxor \GH, \T2,\T2
- vpxor \T1, \T2,\T2 # T2 = a0*b1+a1*b0
-
- vpslldq $8, \T2,\T3 # shift-L T3 2 DWs
- vpsrldq $8, \T2,\T2 # shift-R T2 2 DWs
- vpxor \T3, \GH, \GH
- vpxor \T2, \T1, \T1 # <T1:GH> = GH x HK
-
- #first phase of the reduction
- vpslld $31, \GH, \T2 # packed right shifting << 31
- vpslld $30, \GH, \T3 # packed right shifting shift << 30
- vpslld $25, \GH, \T4 # packed right shifting shift << 25
-
- vpxor \T3, \T2, \T2 # xor the shifted versions
- vpxor \T4, \T2, \T2
-
- vpsrldq $4, \T2, \T5 # shift-R T5 1 DW
-
- vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
- vpxor \T2, \GH, \GH # first phase of the reduction complete
-
- #second phase of the reduction
-
- vpsrld $1,\GH, \T2 # packed left shifting >> 1
- vpsrld $2,\GH, \T3 # packed left shifting >> 2
- vpsrld $7,\GH, \T4 # packed left shifting >> 7
- vpxor \T3, \T2, \T2 # xor the shifted versions
- vpxor \T4, \T2, \T2
-
- vpxor \T5, \T2, \T2
- vpxor \T2, \GH, \GH
- vpxor \T1, \GH, \GH # the result is in GH
-
-
-.endm
-
-.macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6
-
- # Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
- vmovdqa \HK, \T5
-
- vpshufd $0b01001110, \T5, \T1
- vpxor \T5, \T1, \T1
- vmovdqu \T1, HashKey_k(arg2)
-
- GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly
- vmovdqu \T5, HashKey_2(arg2) # [HashKey_2] = HashKey^2<<1 mod poly
- vpshufd $0b01001110, \T5, \T1
- vpxor \T5, \T1, \T1
- vmovdqu \T1, HashKey_2_k(arg2)
-
- GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly
- vmovdqu \T5, HashKey_3(arg2)
- vpshufd $0b01001110, \T5, \T1
- vpxor \T5, \T1, \T1
- vmovdqu \T1, HashKey_3_k(arg2)
-
- GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly
- vmovdqu \T5, HashKey_4(arg2)
- vpshufd $0b01001110, \T5, \T1
- vpxor \T5, \T1, \T1
- vmovdqu \T1, HashKey_4_k(arg2)
-
- GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly
- vmovdqu \T5, HashKey_5(arg2)
- vpshufd $0b01001110, \T5, \T1
- vpxor \T5, \T1, \T1
- vmovdqu \T1, HashKey_5_k(arg2)
-
- GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly
- vmovdqu \T5, HashKey_6(arg2)
- vpshufd $0b01001110, \T5, \T1
- vpxor \T5, \T1, \T1
- vmovdqu \T1, HashKey_6_k(arg2)
-
- GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly
- vmovdqu \T5, HashKey_7(arg2)
- vpshufd $0b01001110, \T5, \T1
- vpxor \T5, \T1, \T1
- vmovdqu \T1, HashKey_7_k(arg2)
-
- GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly
- vmovdqu \T5, HashKey_8(arg2)
- vpshufd $0b01001110, \T5, \T1
- vpxor \T5, \T1, \T1
- vmovdqu \T1, HashKey_8_k(arg2)
-
-.endm
-
-## if a = number of total plaintext bytes
-## b = floor(a/16)
-## num_initial_blocks = b mod 4#
-## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
-## r10, r11, r12, rax are clobbered
-## arg1, arg2, arg3, arg4 are used as pointers only, not modified
-
-.macro INITIAL_BLOCKS_AVX REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
- i = (8-\num_initial_blocks)
- setreg
- vmovdqu AadHash(arg2), reg_i
-
- # start AES for num_initial_blocks blocks
- vmovdqu CurCount(arg2), \CTR
-
- i = (9-\num_initial_blocks)
- setreg
-.rep \num_initial_blocks
- vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
- vmovdqa \CTR, reg_i
- vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap
- i = (i+1)
- setreg
-.endr
-
- vmovdqa (arg1), \T_key
- i = (9-\num_initial_blocks)
- setreg
-.rep \num_initial_blocks
- vpxor \T_key, reg_i, reg_i
- i = (i+1)
- setreg
-.endr
-
- j = 1
- setreg
-.rep \REP
- vmovdqa 16*j(arg1), \T_key
- i = (9-\num_initial_blocks)
- setreg
-.rep \num_initial_blocks
- vaesenc \T_key, reg_i, reg_i
- i = (i+1)
- setreg
-.endr
-
- j = (j+1)
- setreg
-.endr
-
- vmovdqa 16*j(arg1), \T_key
- i = (9-\num_initial_blocks)
- setreg
-.rep \num_initial_blocks
- vaesenclast \T_key, reg_i, reg_i
- i = (i+1)
- setreg
-.endr
-
- i = (9-\num_initial_blocks)
- setreg
-.rep \num_initial_blocks
- vmovdqu (arg4, %r11), \T1
- vpxor \T1, reg_i, reg_i
- vmovdqu reg_i, (arg3 , %r11) # write back ciphertext for num_initial_blocks blocks
- add $16, %r11
-.if \ENC_DEC == DEC
- vmovdqa \T1, reg_i
-.endif
- vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations
- i = (i+1)
- setreg
-.endr
-
-
- i = (8-\num_initial_blocks)
- j = (9-\num_initial_blocks)
- setreg
-
-.rep \num_initial_blocks
- vpxor reg_i, reg_j, reg_j
- GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
- i = (i+1)
- j = (j+1)
- setreg
-.endr
- # XMM8 has the combined result here
-
- vmovdqa \XMM8, TMP1(%rsp)
- vmovdqa \XMM8, \T3
-
- cmp $128, %r13
- jl .L_initial_blocks_done\@ # no need for precomputed constants
-
-###############################################################################
-# Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
- vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
- vmovdqa \CTR, \XMM1
- vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
-
- vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
- vmovdqa \CTR, \XMM2
- vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
-
- vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
- vmovdqa \CTR, \XMM3
- vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
-
- vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
- vmovdqa \CTR, \XMM4
- vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
-
- vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
- vmovdqa \CTR, \XMM5
- vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
-
- vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
- vmovdqa \CTR, \XMM6
- vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
-
- vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
- vmovdqa \CTR, \XMM7
- vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
-
- vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
- vmovdqa \CTR, \XMM8
- vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
-
- vmovdqa (arg1), \T_key
- vpxor \T_key, \XMM1, \XMM1
- vpxor \T_key, \XMM2, \XMM2
- vpxor \T_key, \XMM3, \XMM3
- vpxor \T_key, \XMM4, \XMM4
- vpxor \T_key, \XMM5, \XMM5
- vpxor \T_key, \XMM6, \XMM6
- vpxor \T_key, \XMM7, \XMM7
- vpxor \T_key, \XMM8, \XMM8
-
- i = 1
- setreg
-.rep \REP # do REP rounds
- vmovdqa 16*i(arg1), \T_key
- vaesenc \T_key, \XMM1, \XMM1
- vaesenc \T_key, \XMM2, \XMM2
- vaesenc \T_key, \XMM3, \XMM3
- vaesenc \T_key, \XMM4, \XMM4
- vaesenc \T_key, \XMM5, \XMM5
- vaesenc \T_key, \XMM6, \XMM6
- vaesenc \T_key, \XMM7, \XMM7
- vaesenc \T_key, \XMM8, \XMM8
- i = (i+1)
- setreg
-.endr
-
- vmovdqa 16*i(arg1), \T_key
- vaesenclast \T_key, \XMM1, \XMM1
- vaesenclast \T_key, \XMM2, \XMM2
- vaesenclast \T_key, \XMM3, \XMM3
- vaesenclast \T_key, \XMM4, \XMM4
- vaesenclast \T_key, \XMM5, \XMM5
- vaesenclast \T_key, \XMM6, \XMM6
- vaesenclast \T_key, \XMM7, \XMM7
- vaesenclast \T_key, \XMM8, \XMM8
-
- vmovdqu (arg4, %r11), \T1
- vpxor \T1, \XMM1, \XMM1
- vmovdqu \XMM1, (arg3 , %r11)
- .if \ENC_DEC == DEC
- vmovdqa \T1, \XMM1
- .endif
-
- vmovdqu 16*1(arg4, %r11), \T1
- vpxor \T1, \XMM2, \XMM2
- vmovdqu \XMM2, 16*1(arg3 , %r11)
- .if \ENC_DEC == DEC
- vmovdqa \T1, \XMM2
- .endif
-
- vmovdqu 16*2(arg4, %r11), \T1
- vpxor \T1, \XMM3, \XMM3
- vmovdqu \XMM3, 16*2(arg3 , %r11)
- .if \ENC_DEC == DEC
- vmovdqa \T1, \XMM3
- .endif
-
- vmovdqu 16*3(arg4, %r11), \T1
- vpxor \T1, \XMM4, \XMM4
- vmovdqu \XMM4, 16*3(arg3 , %r11)
- .if \ENC_DEC == DEC
- vmovdqa \T1, \XMM4
- .endif
-
- vmovdqu 16*4(arg4, %r11), \T1
- vpxor \T1, \XMM5, \XMM5
- vmovdqu \XMM5, 16*4(arg3 , %r11)
- .if \ENC_DEC == DEC
- vmovdqa \T1, \XMM5
- .endif
-
- vmovdqu 16*5(arg4, %r11), \T1
- vpxor \T1, \XMM6, \XMM6
- vmovdqu \XMM6, 16*5(arg3 , %r11)
- .if \ENC_DEC == DEC
- vmovdqa \T1, \XMM6
- .endif
-
- vmovdqu 16*6(arg4, %r11), \T1
- vpxor \T1, \XMM7, \XMM7
- vmovdqu \XMM7, 16*6(arg3 , %r11)
- .if \ENC_DEC == DEC
- vmovdqa \T1, \XMM7
- .endif
-
- vmovdqu 16*7(arg4, %r11), \T1
- vpxor \T1, \XMM8, \XMM8
- vmovdqu \XMM8, 16*7(arg3 , %r11)
- .if \ENC_DEC == DEC
- vmovdqa \T1, \XMM8
- .endif
-
- add $128, %r11
-
- vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
- vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with the corresponding ciphertext
- vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
-
-###############################################################################
-
-.L_initial_blocks_done\@:
-
-.endm
-
-# encrypt 8 blocks at a time
-# ghash the 8 previously encrypted ciphertext blocks
-# arg1, arg2, arg3, arg4 are used as pointers only, not modified
-# r11 is the data offset value
-.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
-
- vmovdqa \XMM1, \T2
- vmovdqa \XMM2, TMP2(%rsp)
- vmovdqa \XMM3, TMP3(%rsp)
- vmovdqa \XMM4, TMP4(%rsp)
- vmovdqa \XMM5, TMP5(%rsp)
- vmovdqa \XMM6, TMP6(%rsp)
- vmovdqa \XMM7, TMP7(%rsp)
- vmovdqa \XMM8, TMP8(%rsp)
-
-.if \loop_idx == in_order
- vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT
- vpaddd ONE(%rip), \XMM1, \XMM2
- vpaddd ONE(%rip), \XMM2, \XMM3
- vpaddd ONE(%rip), \XMM3, \XMM4
- vpaddd ONE(%rip), \XMM4, \XMM5
- vpaddd ONE(%rip), \XMM5, \XMM6
- vpaddd ONE(%rip), \XMM6, \XMM7
- vpaddd ONE(%rip), \XMM7, \XMM8
- vmovdqa \XMM8, \CTR
-
- vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
-.else
- vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT
- vpaddd ONEf(%rip), \XMM1, \XMM2
- vpaddd ONEf(%rip), \XMM2, \XMM3
- vpaddd ONEf(%rip), \XMM3, \XMM4
- vpaddd ONEf(%rip), \XMM4, \XMM5
- vpaddd ONEf(%rip), \XMM5, \XMM6
- vpaddd ONEf(%rip), \XMM6, \XMM7
- vpaddd ONEf(%rip), \XMM7, \XMM8
- vmovdqa \XMM8, \CTR
-.endif
-
-
- #######################################################################
-
- vmovdqu (arg1), \T1
- vpxor \T1, \XMM1, \XMM1
- vpxor \T1, \XMM2, \XMM2
- vpxor \T1, \XMM3, \XMM3
- vpxor \T1, \XMM4, \XMM4
- vpxor \T1, \XMM5, \XMM5
- vpxor \T1, \XMM6, \XMM6
- vpxor \T1, \XMM7, \XMM7
- vpxor \T1, \XMM8, \XMM8
-
- #######################################################################
-
-
-
-
-
- vmovdqu 16*1(arg1), \T1
- vaesenc \T1, \XMM1, \XMM1
- vaesenc \T1, \XMM2, \XMM2
- vaesenc \T1, \XMM3, \XMM3
- vaesenc \T1, \XMM4, \XMM4
- vaesenc \T1, \XMM5, \XMM5
- vaesenc \T1, \XMM6, \XMM6
- vaesenc \T1, \XMM7, \XMM7
- vaesenc \T1, \XMM8, \XMM8
-
- vmovdqu 16*2(arg1), \T1
- vaesenc \T1, \XMM1, \XMM1
- vaesenc \T1, \XMM2, \XMM2
- vaesenc \T1, \XMM3, \XMM3
- vaesenc \T1, \XMM4, \XMM4
- vaesenc \T1, \XMM5, \XMM5
- vaesenc \T1, \XMM6, \XMM6
- vaesenc \T1, \XMM7, \XMM7
- vaesenc \T1, \XMM8, \XMM8
-
-
- #######################################################################
-
- vmovdqu HashKey_8(arg2), \T5
- vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1
- vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0
-
- vpshufd $0b01001110, \T2, \T6
- vpxor \T2, \T6, \T6
-
- vmovdqu HashKey_8_k(arg2), \T5
- vpclmulqdq $0x00, \T5, \T6, \T6
-
- vmovdqu 16*3(arg1), \T1
- vaesenc \T1, \XMM1, \XMM1
- vaesenc \T1, \XMM2, \XMM2
- vaesenc \T1, \XMM3, \XMM3
- vaesenc \T1, \XMM4, \XMM4
- vaesenc \T1, \XMM5, \XMM5
- vaesenc \T1, \XMM6, \XMM6
- vaesenc \T1, \XMM7, \XMM7
- vaesenc \T1, \XMM8, \XMM8
-
- vmovdqa TMP2(%rsp), \T1
- vmovdqu HashKey_7(arg2), \T5
- vpclmulqdq $0x11, \T5, \T1, \T3
- vpxor \T3, \T4, \T4
- vpclmulqdq $0x00, \T5, \T1, \T3
- vpxor \T3, \T7, \T7
-
- vpshufd $0b01001110, \T1, \T3
- vpxor \T1, \T3, \T3
- vmovdqu HashKey_7_k(arg2), \T5
- vpclmulqdq $0x10, \T5, \T3, \T3
- vpxor \T3, \T6, \T6
-
- vmovdqu 16*4(arg1), \T1
- vaesenc \T1, \XMM1, \XMM1
- vaesenc \T1, \XMM2, \XMM2
- vaesenc \T1, \XMM3, \XMM3
- vaesenc \T1, \XMM4, \XMM4
- vaesenc \T1, \XMM5, \XMM5
- vaesenc \T1, \XMM6, \XMM6
- vaesenc \T1, \XMM7, \XMM7
- vaesenc \T1, \XMM8, \XMM8
-
- #######################################################################
-
- vmovdqa TMP3(%rsp), \T1
- vmovdqu HashKey_6(arg2), \T5
- vpclmulqdq $0x11, \T5, \T1, \T3
- vpxor \T3, \T4, \T4
- vpclmulqdq $0x00, \T5, \T1, \T3
- vpxor \T3, \T7, \T7
-
- vpshufd $0b01001110, \T1, \T3
- vpxor \T1, \T3, \T3
- vmovdqu HashKey_6_k(arg2), \T5
- vpclmulqdq $0x10, \T5, \T3, \T3
- vpxor \T3, \T6, \T6
-
- vmovdqu 16*5(arg1), \T1
- vaesenc \T1, \XMM1, \XMM1
- vaesenc \T1, \XMM2, \XMM2
- vaesenc \T1, \XMM3, \XMM3
- vaesenc \T1, \XMM4, \XMM4
- vaesenc \T1, \XMM5, \XMM5
- vaesenc \T1, \XMM6, \XMM6
- vaesenc \T1, \XMM7, \XMM7
- vaesenc \T1, \XMM8, \XMM8
-
- vmovdqa TMP4(%rsp), \T1
- vmovdqu HashKey_5(arg2), \T5
- vpclmulqdq $0x11, \T5, \T1, \T3
- vpxor \T3, \T4, \T4
- vpclmulqdq $0x00, \T5, \T1, \T3
- vpxor \T3, \T7, \T7
-
- vpshufd $0b01001110, \T1, \T3
- vpxor \T1, \T3, \T3
- vmovdqu HashKey_5_k(arg2), \T5
- vpclmulqdq $0x10, \T5, \T3, \T3
- vpxor \T3, \T6, \T6
-
- vmovdqu 16*6(arg1), \T1
- vaesenc \T1, \XMM1, \XMM1
- vaesenc \T1, \XMM2, \XMM2
- vaesenc \T1, \XMM3, \XMM3
- vaesenc \T1, \XMM4, \XMM4
- vaesenc \T1, \XMM5, \XMM5
- vaesenc \T1, \XMM6, \XMM6
- vaesenc \T1, \XMM7, \XMM7
- vaesenc \T1, \XMM8, \XMM8
-
-
- vmovdqa TMP5(%rsp), \T1
- vmovdqu HashKey_4(arg2), \T5
- vpclmulqdq $0x11, \T5, \T1, \T3
- vpxor \T3, \T4, \T4
- vpclmulqdq $0x00, \T5, \T1, \T3
- vpxor \T3, \T7, \T7
-
- vpshufd $0b01001110, \T1, \T3
- vpxor \T1, \T3, \T3
- vmovdqu HashKey_4_k(arg2), \T5
- vpclmulqdq $0x10, \T5, \T3, \T3
- vpxor \T3, \T6, \T6
-
- vmovdqu 16*7(arg1), \T1
- vaesenc \T1, \XMM1, \XMM1
- vaesenc \T1, \XMM2, \XMM2
- vaesenc \T1, \XMM3, \XMM3
- vaesenc \T1, \XMM4, \XMM4
- vaesenc \T1, \XMM5, \XMM5
- vaesenc \T1, \XMM6, \XMM6
- vaesenc \T1, \XMM7, \XMM7
- vaesenc \T1, \XMM8, \XMM8
-
- vmovdqa TMP6(%rsp), \T1
- vmovdqu HashKey_3(arg2), \T5
- vpclmulqdq $0x11, \T5, \T1, \T3
- vpxor \T3, \T4, \T4
- vpclmulqdq $0x00, \T5, \T1, \T3
- vpxor \T3, \T7, \T7
-
- vpshufd $0b01001110, \T1, \T3
- vpxor \T1, \T3, \T3
- vmovdqu HashKey_3_k(arg2), \T5
- vpclmulqdq $0x10, \T5, \T3, \T3
- vpxor \T3, \T6, \T6
-
-
- vmovdqu 16*8(arg1), \T1
- vaesenc \T1, \XMM1, \XMM1
- vaesenc \T1, \XMM2, \XMM2
- vaesenc \T1, \XMM3, \XMM3
- vaesenc \T1, \XMM4, \XMM4
- vaesenc \T1, \XMM5, \XMM5
- vaesenc \T1, \XMM6, \XMM6
- vaesenc \T1, \XMM7, \XMM7
- vaesenc \T1, \XMM8, \XMM8
-
- vmovdqa TMP7(%rsp), \T1
- vmovdqu HashKey_2(arg2), \T5
- vpclmulqdq $0x11, \T5, \T1, \T3
- vpxor \T3, \T4, \T4
- vpclmulqdq $0x00, \T5, \T1, \T3
- vpxor \T3, \T7, \T7
-
- vpshufd $0b01001110, \T1, \T3
- vpxor \T1, \T3, \T3
- vmovdqu HashKey_2_k(arg2), \T5
- vpclmulqdq $0x10, \T5, \T3, \T3
- vpxor \T3, \T6, \T6
-
- #######################################################################
-
- vmovdqu 16*9(arg1), \T5
- vaesenc \T5, \XMM1, \XMM1
- vaesenc \T5, \XMM2, \XMM2
- vaesenc \T5, \XMM3, \XMM3
- vaesenc \T5, \XMM4, \XMM4
- vaesenc \T5, \XMM5, \XMM5
- vaesenc \T5, \XMM6, \XMM6
- vaesenc \T5, \XMM7, \XMM7
- vaesenc \T5, \XMM8, \XMM8
-
- vmovdqa TMP8(%rsp), \T1
- vmovdqu HashKey(arg2), \T5
- vpclmulqdq $0x11, \T5, \T1, \T3
- vpxor \T3, \T4, \T4
- vpclmulqdq $0x00, \T5, \T1, \T3
- vpxor \T3, \T7, \T7
-
- vpshufd $0b01001110, \T1, \T3
- vpxor \T1, \T3, \T3
- vmovdqu HashKey_k(arg2), \T5
- vpclmulqdq $0x10, \T5, \T3, \T3
- vpxor \T3, \T6, \T6
-
- vpxor \T4, \T6, \T6
- vpxor \T7, \T6, \T6
-
- vmovdqu 16*10(arg1), \T5
-
- i = 11
- setreg
-.rep (\REP-9)
-
- vaesenc \T5, \XMM1, \XMM1
- vaesenc \T5, \XMM2, \XMM2
- vaesenc \T5, \XMM3, \XMM3
- vaesenc \T5, \XMM4, \XMM4
- vaesenc \T5, \XMM5, \XMM5
- vaesenc \T5, \XMM6, \XMM6
- vaesenc \T5, \XMM7, \XMM7
- vaesenc \T5, \XMM8, \XMM8
-
- vmovdqu 16*i(arg1), \T5
- i = i + 1
- setreg
-.endr
-
- i = 0
- j = 1
- setreg
-.rep 8
- vpxor 16*i(arg4, %r11), \T5, \T2
- .if \ENC_DEC == ENC
- vaesenclast \T2, reg_j, reg_j
- .else
- vaesenclast \T2, reg_j, \T3
- vmovdqu 16*i(arg4, %r11), reg_j
- vmovdqu \T3, 16*i(arg3, %r11)
- .endif
- i = (i+1)
- j = (j+1)
- setreg
-.endr
- #######################################################################
-
-
- vpslldq $8, \T6, \T3 # shift-L T3 2 DWs
- vpsrldq $8, \T6, \T6 # shift-R T2 2 DWs
- vpxor \T3, \T7, \T7
- vpxor \T4, \T6, \T6 # accumulate the results in T6:T7
-
-
-
- #######################################################################
- #first phase of the reduction
- #######################################################################
- vpslld $31, \T7, \T2 # packed right shifting << 31
- vpslld $30, \T7, \T3 # packed right shifting shift << 30
- vpslld $25, \T7, \T4 # packed right shifting shift << 25
-
- vpxor \T3, \T2, \T2 # xor the shifted versions
- vpxor \T4, \T2, \T2
-
- vpsrldq $4, \T2, \T1 # shift-R T1 1 DW
-
- vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
- vpxor \T2, \T7, \T7 # first phase of the reduction complete
- #######################################################################
- .if \ENC_DEC == ENC
- vmovdqu \XMM1, 16*0(arg3,%r11) # Write to the Ciphertext buffer
- vmovdqu \XMM2, 16*1(arg3,%r11) # Write to the Ciphertext buffer
- vmovdqu \XMM3, 16*2(arg3,%r11) # Write to the Ciphertext buffer
- vmovdqu \XMM4, 16*3(arg3,%r11) # Write to the Ciphertext buffer
- vmovdqu \XMM5, 16*4(arg3,%r11) # Write to the Ciphertext buffer
- vmovdqu \XMM6, 16*5(arg3,%r11) # Write to the Ciphertext buffer
- vmovdqu \XMM7, 16*6(arg3,%r11) # Write to the Ciphertext buffer
- vmovdqu \XMM8, 16*7(arg3,%r11) # Write to the Ciphertext buffer
- .endif
-
- #######################################################################
- #second phase of the reduction
- vpsrld $1, \T7, \T2 # packed left shifting >> 1
- vpsrld $2, \T7, \T3 # packed left shifting >> 2
- vpsrld $7, \T7, \T4 # packed left shifting >> 7
- vpxor \T3, \T2, \T2 # xor the shifted versions
- vpxor \T4, \T2, \T2
-
- vpxor \T1, \T2, \T2
- vpxor \T2, \T7, \T7
- vpxor \T7, \T6, \T6 # the result is in T6
- #######################################################################
-
- vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
-
-
- vpxor \T6, \XMM1, \XMM1
-
-
-
-.endm
-
-
-# GHASH the last 4 ciphertext blocks.
-.macro GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
-
- ## Karatsuba Method
-
-
- vpshufd $0b01001110, \XMM1, \T2
- vpxor \XMM1, \T2, \T2
- vmovdqu HashKey_8(arg2), \T5
- vpclmulqdq $0x11, \T5, \XMM1, \T6
- vpclmulqdq $0x00, \T5, \XMM1, \T7
-
- vmovdqu HashKey_8_k(arg2), \T3
- vpclmulqdq $0x00, \T3, \T2, \XMM1
-
- ######################
-
- vpshufd $0b01001110, \XMM2, \T2
- vpxor \XMM2, \T2, \T2
- vmovdqu HashKey_7(arg2), \T5
- vpclmulqdq $0x11, \T5, \XMM2, \T4
- vpxor \T4, \T6, \T6
-
- vpclmulqdq $0x00, \T5, \XMM2, \T4
- vpxor \T4, \T7, \T7
-
- vmovdqu HashKey_7_k(arg2), \T3
- vpclmulqdq $0x00, \T3, \T2, \T2
- vpxor \T2, \XMM1, \XMM1
-
- ######################
-
- vpshufd $0b01001110, \XMM3, \T2
- vpxor \XMM3, \T2, \T2
- vmovdqu HashKey_6(arg2), \T5
- vpclmulqdq $0x11, \T5, \XMM3, \T4
- vpxor \T4, \T6, \T6
-
- vpclmulqdq $0x00, \T5, \XMM3, \T4
- vpxor \T4, \T7, \T7
-
- vmovdqu HashKey_6_k(arg2), \T3
- vpclmulqdq $0x00, \T3, \T2, \T2
- vpxor \T2, \XMM1, \XMM1
-
- ######################
-
- vpshufd $0b01001110, \XMM4, \T2
- vpxor \XMM4, \T2, \T2
- vmovdqu HashKey_5(arg2), \T5
- vpclmulqdq $0x11, \T5, \XMM4, \T4
- vpxor \T4, \T6, \T6
-
- vpclmulqdq $0x00, \T5, \XMM4, \T4
- vpxor \T4, \T7, \T7
-
- vmovdqu HashKey_5_k(arg2), \T3
- vpclmulqdq $0x00, \T3, \T2, \T2
- vpxor \T2, \XMM1, \XMM1
-
- ######################
-
- vpshufd $0b01001110, \XMM5, \T2
- vpxor \XMM5, \T2, \T2
- vmovdqu HashKey_4(arg2), \T5
- vpclmulqdq $0x11, \T5, \XMM5, \T4
- vpxor \T4, \T6, \T6
-
- vpclmulqdq $0x00, \T5, \XMM5, \T4
- vpxor \T4, \T7, \T7
-
- vmovdqu HashKey_4_k(arg2), \T3
- vpclmulqdq $0x00, \T3, \T2, \T2
- vpxor \T2, \XMM1, \XMM1
-
- ######################
-
- vpshufd $0b01001110, \XMM6, \T2
- vpxor \XMM6, \T2, \T2
- vmovdqu HashKey_3(arg2), \T5
- vpclmulqdq $0x11, \T5, \XMM6, \T4
- vpxor \T4, \T6, \T6
-
- vpclmulqdq $0x00, \T5, \XMM6, \T4
- vpxor \T4, \T7, \T7
-
- vmovdqu HashKey_3_k(arg2), \T3
- vpclmulqdq $0x00, \T3, \T2, \T2
- vpxor \T2, \XMM1, \XMM1
-
- ######################
-
- vpshufd $0b01001110, \XMM7, \T2
- vpxor \XMM7, \T2, \T2
- vmovdqu HashKey_2(arg2), \T5
- vpclmulqdq $0x11, \T5, \XMM7, \T4
- vpxor \T4, \T6, \T6
-
- vpclmulqdq $0x00, \T5, \XMM7, \T4
- vpxor \T4, \T7, \T7
-
- vmovdqu HashKey_2_k(arg2), \T3
- vpclmulqdq $0x00, \T3, \T2, \T2
- vpxor \T2, \XMM1, \XMM1
-
- ######################
-
- vpshufd $0b01001110, \XMM8, \T2
- vpxor \XMM8, \T2, \T2
- vmovdqu HashKey(arg2), \T5
- vpclmulqdq $0x11, \T5, \XMM8, \T4
- vpxor \T4, \T6, \T6
-
- vpclmulqdq $0x00, \T5, \XMM8, \T4
- vpxor \T4, \T7, \T7
-
- vmovdqu HashKey_k(arg2), \T3
- vpclmulqdq $0x00, \T3, \T2, \T2
-
- vpxor \T2, \XMM1, \XMM1
- vpxor \T6, \XMM1, \XMM1
- vpxor \T7, \XMM1, \T2
-
-
-
-
- vpslldq $8, \T2, \T4
- vpsrldq $8, \T2, \T2
-
- vpxor \T4, \T7, \T7
- vpxor \T2, \T6, \T6 # <T6:T7> holds the result of
- # the accumulated carry-less multiplications
-
- #######################################################################
- #first phase of the reduction
- vpslld $31, \T7, \T2 # packed right shifting << 31
- vpslld $30, \T7, \T3 # packed right shifting shift << 30
- vpslld $25, \T7, \T4 # packed right shifting shift << 25
-
- vpxor \T3, \T2, \T2 # xor the shifted versions
- vpxor \T4, \T2, \T2
-
- vpsrldq $4, \T2, \T1 # shift-R T1 1 DW
-
- vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
- vpxor \T2, \T7, \T7 # first phase of the reduction complete
- #######################################################################
-
-
- #second phase of the reduction
- vpsrld $1, \T7, \T2 # packed left shifting >> 1
- vpsrld $2, \T7, \T3 # packed left shifting >> 2
- vpsrld $7, \T7, \T4 # packed left shifting >> 7
- vpxor \T3, \T2, \T2 # xor the shifted versions
- vpxor \T4, \T2, \T2
-
- vpxor \T1, \T2, \T2
- vpxor \T2, \T7, \T7
- vpxor \T7, \T6, \T6 # the result is in T6
-
-.endm
-
-#############################################################
-#void aesni_gcm_precomp_avx_gen2
-# (gcm_data *my_ctx_data,
-# gcm_context_data *data,
-# u8 *hash_subkey# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
-# u8 *iv, /* Pre-counter block j0: 4 byte salt
-# (from Security Association) concatenated with 8 byte
-# Initialisation Vector (from IPSec ESP Payload)
-# concatenated with 0x00000001. 16-byte aligned pointer. */
-# const u8 *aad, /* Additional Authentication Data (AAD)*/
-# u64 aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
-#############################################################
-SYM_FUNC_START(aesni_gcm_init_avx_gen2)
- FUNC_SAVE
- INIT GHASH_MUL_AVX, PRECOMPUTE_AVX
- FUNC_RESTORE
- RET
-SYM_FUNC_END(aesni_gcm_init_avx_gen2)
-
-###############################################################################
-#void aesni_gcm_enc_update_avx_gen2(
-# gcm_data *my_ctx_data, /* aligned to 16 Bytes */
-# gcm_context_data *data,
-# u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
-# const u8 *in, /* Plaintext input */
-# u64 plaintext_len) /* Length of data in Bytes for encryption. */
-###############################################################################
-SYM_FUNC_START(aesni_gcm_enc_update_avx_gen2)
- FUNC_SAVE
- mov keysize, %eax
- cmp $32, %eax
- je key_256_enc_update
- cmp $16, %eax
- je key_128_enc_update
- # must be 192
- GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 11
- FUNC_RESTORE
- RET
-key_128_enc_update:
- GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 9
- FUNC_RESTORE
- RET
-key_256_enc_update:
- GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 13
- FUNC_RESTORE
- RET
-SYM_FUNC_END(aesni_gcm_enc_update_avx_gen2)
-
-###############################################################################
-#void aesni_gcm_dec_update_avx_gen2(
-# gcm_data *my_ctx_data, /* aligned to 16 Bytes */
-# gcm_context_data *data,
-# u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
-# const u8 *in, /* Ciphertext input */
-# u64 plaintext_len) /* Length of data in Bytes for encryption. */
-###############################################################################
-SYM_FUNC_START(aesni_gcm_dec_update_avx_gen2)
- FUNC_SAVE
- mov keysize,%eax
- cmp $32, %eax
- je key_256_dec_update
- cmp $16, %eax
- je key_128_dec_update
- # must be 192
- GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 11
- FUNC_RESTORE
- RET
-key_128_dec_update:
- GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 9
- FUNC_RESTORE
- RET
-key_256_dec_update:
- GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 13
- FUNC_RESTORE
- RET
-SYM_FUNC_END(aesni_gcm_dec_update_avx_gen2)
-
-###############################################################################
-#void aesni_gcm_finalize_avx_gen2(
-# gcm_data *my_ctx_data, /* aligned to 16 Bytes */
-# gcm_context_data *data,
-# u8 *auth_tag, /* Authenticated Tag output. */
-# u64 auth_tag_len)# /* Authenticated Tag Length in bytes.
-# Valid values are 16 (most likely), 12 or 8. */
-###############################################################################
-SYM_FUNC_START(aesni_gcm_finalize_avx_gen2)
- FUNC_SAVE
- mov keysize,%eax
- cmp $32, %eax
- je key_256_finalize
- cmp $16, %eax
- je key_128_finalize
- # must be 192
- GCM_COMPLETE GHASH_MUL_AVX, 11, arg3, arg4
- FUNC_RESTORE
- RET
-key_128_finalize:
- GCM_COMPLETE GHASH_MUL_AVX, 9, arg3, arg4
- FUNC_RESTORE
- RET
-key_256_finalize:
- GCM_COMPLETE GHASH_MUL_AVX, 13, arg3, arg4
- FUNC_RESTORE
- RET
-SYM_FUNC_END(aesni_gcm_finalize_avx_gen2)
-
-###############################################################################
-# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
-# Input: A and B (128-bits each, bit-reflected)
-# Output: C = A*B*x mod poly, (i.e. >>1 )
-# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
-# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
-###############################################################################
-.macro GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5
-
- vpclmulqdq $0x11,\HK,\GH,\T1 # T1 = a1*b1
- vpclmulqdq $0x00,\HK,\GH,\T2 # T2 = a0*b0
- vpclmulqdq $0x01,\HK,\GH,\T3 # T3 = a1*b0
- vpclmulqdq $0x10,\HK,\GH,\GH # GH = a0*b1
- vpxor \T3, \GH, \GH
-
-
- vpsrldq $8 , \GH, \T3 # shift-R GH 2 DWs
- vpslldq $8 , \GH, \GH # shift-L GH 2 DWs
-
- vpxor \T3, \T1, \T1
- vpxor \T2, \GH, \GH
-
- #######################################################################
- #first phase of the reduction
- vmovdqa POLY2(%rip), \T3
-
- vpclmulqdq $0x01, \GH, \T3, \T2
- vpslldq $8, \T2, \T2 # shift-L T2 2 DWs
-
- vpxor \T2, \GH, \GH # first phase of the reduction complete
- #######################################################################
- #second phase of the reduction
- vpclmulqdq $0x00, \GH, \T3, \T2
- vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
-
- vpclmulqdq $0x10, \GH, \T3, \GH
- vpslldq $4, \GH, \GH # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts)
-
- vpxor \T2, \GH, \GH # second phase of the reduction complete
- #######################################################################
- vpxor \T1, \GH, \GH # the result is in GH
-
-
-.endm
-
-.macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6
-
- # Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
- vmovdqa \HK, \T5
- GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly
- vmovdqu \T5, HashKey_2(arg2) # [HashKey_2] = HashKey^2<<1 mod poly
-
- GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly
- vmovdqu \T5, HashKey_3(arg2)
-
- GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly
- vmovdqu \T5, HashKey_4(arg2)
-
- GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly
- vmovdqu \T5, HashKey_5(arg2)
-
- GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly
- vmovdqu \T5, HashKey_6(arg2)
-
- GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly
- vmovdqu \T5, HashKey_7(arg2)
-
- GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly
- vmovdqu \T5, HashKey_8(arg2)
-
-.endm
-
-## if a = number of total plaintext bytes
-## b = floor(a/16)
-## num_initial_blocks = b mod 4#
-## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
-## r10, r11, r12, rax are clobbered
-## arg1, arg2, arg3, arg4 are used as pointers only, not modified
-
-.macro INITIAL_BLOCKS_AVX2 REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
- i = (8-\num_initial_blocks)
- setreg
- vmovdqu AadHash(arg2), reg_i
-
- # start AES for num_initial_blocks blocks
- vmovdqu CurCount(arg2), \CTR
-
- i = (9-\num_initial_blocks)
- setreg
-.rep \num_initial_blocks
- vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
- vmovdqa \CTR, reg_i
- vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap
- i = (i+1)
- setreg
-.endr
-
- vmovdqa (arg1), \T_key
- i = (9-\num_initial_blocks)
- setreg
-.rep \num_initial_blocks
- vpxor \T_key, reg_i, reg_i
- i = (i+1)
- setreg
-.endr
-
- j = 1
- setreg
-.rep \REP
- vmovdqa 16*j(arg1), \T_key
- i = (9-\num_initial_blocks)
- setreg
-.rep \num_initial_blocks
- vaesenc \T_key, reg_i, reg_i
- i = (i+1)
- setreg
-.endr
-
- j = (j+1)
- setreg
-.endr
-
-
- vmovdqa 16*j(arg1), \T_key
- i = (9-\num_initial_blocks)
- setreg
-.rep \num_initial_blocks
- vaesenclast \T_key, reg_i, reg_i
- i = (i+1)
- setreg
-.endr
-
- i = (9-\num_initial_blocks)
- setreg
-.rep \num_initial_blocks
- vmovdqu (arg4, %r11), \T1
- vpxor \T1, reg_i, reg_i
- vmovdqu reg_i, (arg3 , %r11) # write back ciphertext for
- # num_initial_blocks blocks
- add $16, %r11
-.if \ENC_DEC == DEC
- vmovdqa \T1, reg_i
-.endif
- vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations
- i = (i+1)
- setreg
-.endr
-
-
- i = (8-\num_initial_blocks)
- j = (9-\num_initial_blocks)
- setreg
-
-.rep \num_initial_blocks
- vpxor reg_i, reg_j, reg_j
- GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
- i = (i+1)
- j = (j+1)
- setreg
-.endr
- # XMM8 has the combined result here
-
- vmovdqa \XMM8, TMP1(%rsp)
- vmovdqa \XMM8, \T3
-
- cmp $128, %r13
- jl .L_initial_blocks_done\@ # no need for precomputed constants
-
-###############################################################################
-# Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
- vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
- vmovdqa \CTR, \XMM1
- vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
-
- vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
- vmovdqa \CTR, \XMM2
- vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
-
- vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
- vmovdqa \CTR, \XMM3
- vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
-
- vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
- vmovdqa \CTR, \XMM4
- vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
-
- vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
- vmovdqa \CTR, \XMM5
- vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
-
- vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
- vmovdqa \CTR, \XMM6
- vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
-
- vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
- vmovdqa \CTR, \XMM7
- vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
-
- vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
- vmovdqa \CTR, \XMM8
- vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
-
- vmovdqa (arg1), \T_key
- vpxor \T_key, \XMM1, \XMM1
- vpxor \T_key, \XMM2, \XMM2
- vpxor \T_key, \XMM3, \XMM3
- vpxor \T_key, \XMM4, \XMM4
- vpxor \T_key, \XMM5, \XMM5
- vpxor \T_key, \XMM6, \XMM6
- vpxor \T_key, \XMM7, \XMM7
- vpxor \T_key, \XMM8, \XMM8
-
- i = 1
- setreg
-.rep \REP # do REP rounds
- vmovdqa 16*i(arg1), \T_key
- vaesenc \T_key, \XMM1, \XMM1
- vaesenc \T_key, \XMM2, \XMM2
- vaesenc \T_key, \XMM3, \XMM3
- vaesenc \T_key, \XMM4, \XMM4
- vaesenc \T_key, \XMM5, \XMM5
- vaesenc \T_key, \XMM6, \XMM6
- vaesenc \T_key, \XMM7, \XMM7
- vaesenc \T_key, \XMM8, \XMM8
- i = (i+1)
- setreg
-.endr
-
-
- vmovdqa 16*i(arg1), \T_key
- vaesenclast \T_key, \XMM1, \XMM1
- vaesenclast \T_key, \XMM2, \XMM2
- vaesenclast \T_key, \XMM3, \XMM3
- vaesenclast \T_key, \XMM4, \XMM4
- vaesenclast \T_key, \XMM5, \XMM5
- vaesenclast \T_key, \XMM6, \XMM6
- vaesenclast \T_key, \XMM7, \XMM7
- vaesenclast \T_key, \XMM8, \XMM8
-
- vmovdqu (arg4, %r11), \T1
- vpxor \T1, \XMM1, \XMM1
- vmovdqu \XMM1, (arg3 , %r11)
- .if \ENC_DEC == DEC
- vmovdqa \T1, \XMM1
- .endif
-
- vmovdqu 16*1(arg4, %r11), \T1
- vpxor \T1, \XMM2, \XMM2
- vmovdqu \XMM2, 16*1(arg3 , %r11)
- .if \ENC_DEC == DEC
- vmovdqa \T1, \XMM2
- .endif
-
- vmovdqu 16*2(arg4, %r11), \T1
- vpxor \T1, \XMM3, \XMM3
- vmovdqu \XMM3, 16*2(arg3 , %r11)
- .if \ENC_DEC == DEC
- vmovdqa \T1, \XMM3
- .endif
-
- vmovdqu 16*3(arg4, %r11), \T1
- vpxor \T1, \XMM4, \XMM4
- vmovdqu \XMM4, 16*3(arg3 , %r11)
- .if \ENC_DEC == DEC
- vmovdqa \T1, \XMM4
- .endif
-
- vmovdqu 16*4(arg4, %r11), \T1
- vpxor \T1, \XMM5, \XMM5
- vmovdqu \XMM5, 16*4(arg3 , %r11)
- .if \ENC_DEC == DEC
- vmovdqa \T1, \XMM5
- .endif
-
- vmovdqu 16*5(arg4, %r11), \T1
- vpxor \T1, \XMM6, \XMM6
- vmovdqu \XMM6, 16*5(arg3 , %r11)
- .if \ENC_DEC == DEC
- vmovdqa \T1, \XMM6
- .endif
-
- vmovdqu 16*6(arg4, %r11), \T1
- vpxor \T1, \XMM7, \XMM7
- vmovdqu \XMM7, 16*6(arg3 , %r11)
- .if \ENC_DEC == DEC
- vmovdqa \T1, \XMM7
- .endif
-
- vmovdqu 16*7(arg4, %r11), \T1
- vpxor \T1, \XMM8, \XMM8
- vmovdqu \XMM8, 16*7(arg3 , %r11)
- .if \ENC_DEC == DEC
- vmovdqa \T1, \XMM8
- .endif
-
- add $128, %r11
-
- vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
- vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with
- # the corresponding ciphertext
- vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
-
-###############################################################################
-
-.L_initial_blocks_done\@:
-
-
-.endm
-
-
-
-# encrypt 8 blocks at a time
-# ghash the 8 previously encrypted ciphertext blocks
-# arg1, arg2, arg3, arg4 are used as pointers only, not modified
-# r11 is the data offset value
-.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
-
- vmovdqa \XMM1, \T2
- vmovdqa \XMM2, TMP2(%rsp)
- vmovdqa \XMM3, TMP3(%rsp)
- vmovdqa \XMM4, TMP4(%rsp)
- vmovdqa \XMM5, TMP5(%rsp)
- vmovdqa \XMM6, TMP6(%rsp)
- vmovdqa \XMM7, TMP7(%rsp)
- vmovdqa \XMM8, TMP8(%rsp)
-
-.if \loop_idx == in_order
- vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT
- vpaddd ONE(%rip), \XMM1, \XMM2
- vpaddd ONE(%rip), \XMM2, \XMM3
- vpaddd ONE(%rip), \XMM3, \XMM4
- vpaddd ONE(%rip), \XMM4, \XMM5
- vpaddd ONE(%rip), \XMM5, \XMM6
- vpaddd ONE(%rip), \XMM6, \XMM7
- vpaddd ONE(%rip), \XMM7, \XMM8
- vmovdqa \XMM8, \CTR
-
- vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
-.else
- vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT
- vpaddd ONEf(%rip), \XMM1, \XMM2
- vpaddd ONEf(%rip), \XMM2, \XMM3
- vpaddd ONEf(%rip), \XMM3, \XMM4
- vpaddd ONEf(%rip), \XMM4, \XMM5
- vpaddd ONEf(%rip), \XMM5, \XMM6
- vpaddd ONEf(%rip), \XMM6, \XMM7
- vpaddd ONEf(%rip), \XMM7, \XMM8
- vmovdqa \XMM8, \CTR
-.endif
-
-
- #######################################################################
-
- vmovdqu (arg1), \T1
- vpxor \T1, \XMM1, \XMM1
- vpxor \T1, \XMM2, \XMM2
- vpxor \T1, \XMM3, \XMM3
- vpxor \T1, \XMM4, \XMM4
- vpxor \T1, \XMM5, \XMM5
- vpxor \T1, \XMM6, \XMM6
- vpxor \T1, \XMM7, \XMM7
- vpxor \T1, \XMM8, \XMM8
-
- #######################################################################
-
-
-
-
-
- vmovdqu 16*1(arg1), \T1
- vaesenc \T1, \XMM1, \XMM1
- vaesenc \T1, \XMM2, \XMM2
- vaesenc \T1, \XMM3, \XMM3
- vaesenc \T1, \XMM4, \XMM4
- vaesenc \T1, \XMM5, \XMM5
- vaesenc \T1, \XMM6, \XMM6
- vaesenc \T1, \XMM7, \XMM7
- vaesenc \T1, \XMM8, \XMM8
-
- vmovdqu 16*2(arg1), \T1
- vaesenc \T1, \XMM1, \XMM1
- vaesenc \T1, \XMM2, \XMM2
- vaesenc \T1, \XMM3, \XMM3
- vaesenc \T1, \XMM4, \XMM4
- vaesenc \T1, \XMM5, \XMM5
- vaesenc \T1, \XMM6, \XMM6
- vaesenc \T1, \XMM7, \XMM7
- vaesenc \T1, \XMM8, \XMM8
-
-
- #######################################################################
-
- vmovdqu HashKey_8(arg2), \T5
- vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1
- vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0
- vpclmulqdq $0x01, \T5, \T2, \T6 # T6 = a1*b0
- vpclmulqdq $0x10, \T5, \T2, \T5 # T5 = a0*b1
- vpxor \T5, \T6, \T6
-
- vmovdqu 16*3(arg1), \T1
- vaesenc \T1, \XMM1, \XMM1
- vaesenc \T1, \XMM2, \XMM2
- vaesenc \T1, \XMM3, \XMM3
- vaesenc \T1, \XMM4, \XMM4
- vaesenc \T1, \XMM5, \XMM5
- vaesenc \T1, \XMM6, \XMM6
- vaesenc \T1, \XMM7, \XMM7
- vaesenc \T1, \XMM8, \XMM8
-
- vmovdqa TMP2(%rsp), \T1
- vmovdqu HashKey_7(arg2), \T5
- vpclmulqdq $0x11, \T5, \T1, \T3
- vpxor \T3, \T4, \T4
-
- vpclmulqdq $0x00, \T5, \T1, \T3
- vpxor \T3, \T7, \T7
-
- vpclmulqdq $0x01, \T5, \T1, \T3
- vpxor \T3, \T6, \T6
-
- vpclmulqdq $0x10, \T5, \T1, \T3
- vpxor \T3, \T6, \T6
-
- vmovdqu 16*4(arg1), \T1
- vaesenc \T1, \XMM1, \XMM1
- vaesenc \T1, \XMM2, \XMM2
- vaesenc \T1, \XMM3, \XMM3
- vaesenc \T1, \XMM4, \XMM4
- vaesenc \T1, \XMM5, \XMM5
- vaesenc \T1, \XMM6, \XMM6
- vaesenc \T1, \XMM7, \XMM7
- vaesenc \T1, \XMM8, \XMM8
-
- #######################################################################
-
- vmovdqa TMP3(%rsp), \T1
- vmovdqu HashKey_6(arg2), \T5
- vpclmulqdq $0x11, \T5, \T1, \T3
- vpxor \T3, \T4, \T4
-
- vpclmulqdq $0x00, \T5, \T1, \T3
- vpxor \T3, \T7, \T7
-
- vpclmulqdq $0x01, \T5, \T1, \T3
- vpxor \T3, \T6, \T6
-
- vpclmulqdq $0x10, \T5, \T1, \T3
- vpxor \T3, \T6, \T6
-
- vmovdqu 16*5(arg1), \T1
- vaesenc \T1, \XMM1, \XMM1
- vaesenc \T1, \XMM2, \XMM2
- vaesenc \T1, \XMM3, \XMM3
- vaesenc \T1, \XMM4, \XMM4
- vaesenc \T1, \XMM5, \XMM5
- vaesenc \T1, \XMM6, \XMM6
- vaesenc \T1, \XMM7, \XMM7
- vaesenc \T1, \XMM8, \XMM8
-
- vmovdqa TMP4(%rsp), \T1
- vmovdqu HashKey_5(arg2), \T5
- vpclmulqdq $0x11, \T5, \T1, \T3
- vpxor \T3, \T4, \T4
-
- vpclmulqdq $0x00, \T5, \T1, \T3
- vpxor \T3, \T7, \T7
-
- vpclmulqdq $0x01, \T5, \T1, \T3
- vpxor \T3, \T6, \T6
-
- vpclmulqdq $0x10, \T5, \T1, \T3
- vpxor \T3, \T6, \T6
-
- vmovdqu 16*6(arg1), \T1
- vaesenc \T1, \XMM1, \XMM1
- vaesenc \T1, \XMM2, \XMM2
- vaesenc \T1, \XMM3, \XMM3
- vaesenc \T1, \XMM4, \XMM4
- vaesenc \T1, \XMM5, \XMM5
- vaesenc \T1, \XMM6, \XMM6
- vaesenc \T1, \XMM7, \XMM7
- vaesenc \T1, \XMM8, \XMM8
-
-
- vmovdqa TMP5(%rsp), \T1
- vmovdqu HashKey_4(arg2), \T5
- vpclmulqdq $0x11, \T5, \T1, \T3
- vpxor \T3, \T4, \T4
-
- vpclmulqdq $0x00, \T5, \T1, \T3
- vpxor \T3, \T7, \T7
-
- vpclmulqdq $0x01, \T5, \T1, \T3
- vpxor \T3, \T6, \T6
-
- vpclmulqdq $0x10, \T5, \T1, \T3
- vpxor \T3, \T6, \T6
-
- vmovdqu 16*7(arg1), \T1
- vaesenc \T1, \XMM1, \XMM1
- vaesenc \T1, \XMM2, \XMM2
- vaesenc \T1, \XMM3, \XMM3
- vaesenc \T1, \XMM4, \XMM4
- vaesenc \T1, \XMM5, \XMM5
- vaesenc \T1, \XMM6, \XMM6
- vaesenc \T1, \XMM7, \XMM7
- vaesenc \T1, \XMM8, \XMM8
-
- vmovdqa TMP6(%rsp), \T1
- vmovdqu HashKey_3(arg2), \T5
- vpclmulqdq $0x11, \T5, \T1, \T3
- vpxor \T3, \T4, \T4
-
- vpclmulqdq $0x00, \T5, \T1, \T3
- vpxor \T3, \T7, \T7
-
- vpclmulqdq $0x01, \T5, \T1, \T3
- vpxor \T3, \T6, \T6
-
- vpclmulqdq $0x10, \T5, \T1, \T3
- vpxor \T3, \T6, \T6
-
- vmovdqu 16*8(arg1), \T1
- vaesenc \T1, \XMM1, \XMM1
- vaesenc \T1, \XMM2, \XMM2
- vaesenc \T1, \XMM3, \XMM3
- vaesenc \T1, \XMM4, \XMM4
- vaesenc \T1, \XMM5, \XMM5
- vaesenc \T1, \XMM6, \XMM6
- vaesenc \T1, \XMM7, \XMM7
- vaesenc \T1, \XMM8, \XMM8
-
- vmovdqa TMP7(%rsp), \T1
- vmovdqu HashKey_2(arg2), \T5
- vpclmulqdq $0x11, \T5, \T1, \T3
- vpxor \T3, \T4, \T4
-
- vpclmulqdq $0x00, \T5, \T1, \T3
- vpxor \T3, \T7, \T7
-
- vpclmulqdq $0x01, \T5, \T1, \T3
- vpxor \T3, \T6, \T6
-
- vpclmulqdq $0x10, \T5, \T1, \T3
- vpxor \T3, \T6, \T6
-
-
- #######################################################################
-
- vmovdqu 16*9(arg1), \T5
- vaesenc \T5, \XMM1, \XMM1
- vaesenc \T5, \XMM2, \XMM2
- vaesenc \T5, \XMM3, \XMM3
- vaesenc \T5, \XMM4, \XMM4
- vaesenc \T5, \XMM5, \XMM5
- vaesenc \T5, \XMM6, \XMM6
- vaesenc \T5, \XMM7, \XMM7
- vaesenc \T5, \XMM8, \XMM8
-
- vmovdqa TMP8(%rsp), \T1
- vmovdqu HashKey(arg2), \T5
-
- vpclmulqdq $0x00, \T5, \T1, \T3
- vpxor \T3, \T7, \T7
-
- vpclmulqdq $0x01, \T5, \T1, \T3
- vpxor \T3, \T6, \T6
-
- vpclmulqdq $0x10, \T5, \T1, \T3
- vpxor \T3, \T6, \T6
-
- vpclmulqdq $0x11, \T5, \T1, \T3
- vpxor \T3, \T4, \T1
-
-
- vmovdqu 16*10(arg1), \T5
-
- i = 11
- setreg
-.rep (\REP-9)
- vaesenc \T5, \XMM1, \XMM1
- vaesenc \T5, \XMM2, \XMM2
- vaesenc \T5, \XMM3, \XMM3
- vaesenc \T5, \XMM4, \XMM4
- vaesenc \T5, \XMM5, \XMM5
- vaesenc \T5, \XMM6, \XMM6
- vaesenc \T5, \XMM7, \XMM7
- vaesenc \T5, \XMM8, \XMM8
-
- vmovdqu 16*i(arg1), \T5
- i = i + 1
- setreg
-.endr
-
- i = 0
- j = 1
- setreg
-.rep 8
- vpxor 16*i(arg4, %r11), \T5, \T2
- .if \ENC_DEC == ENC
- vaesenclast \T2, reg_j, reg_j
- .else
- vaesenclast \T2, reg_j, \T3
- vmovdqu 16*i(arg4, %r11), reg_j
- vmovdqu \T3, 16*i(arg3, %r11)
- .endif
- i = (i+1)
- j = (j+1)
- setreg
-.endr
- #######################################################################
-
-
- vpslldq $8, \T6, \T3 # shift-L T3 2 DWs
- vpsrldq $8, \T6, \T6 # shift-R T2 2 DWs
- vpxor \T3, \T7, \T7
- vpxor \T6, \T1, \T1 # accumulate the results in T1:T7
-
-
-
- #######################################################################
- #first phase of the reduction
- vmovdqa POLY2(%rip), \T3
-
- vpclmulqdq $0x01, \T7, \T3, \T2
- vpslldq $8, \T2, \T2 # shift-L xmm2 2 DWs
-
- vpxor \T2, \T7, \T7 # first phase of the reduction complete
- #######################################################################
- .if \ENC_DEC == ENC
- vmovdqu \XMM1, 16*0(arg3,%r11) # Write to the Ciphertext buffer
- vmovdqu \XMM2, 16*1(arg3,%r11) # Write to the Ciphertext buffer
- vmovdqu \XMM3, 16*2(arg3,%r11) # Write to the Ciphertext buffer
- vmovdqu \XMM4, 16*3(arg3,%r11) # Write to the Ciphertext buffer
- vmovdqu \XMM5, 16*4(arg3,%r11) # Write to the Ciphertext buffer
- vmovdqu \XMM6, 16*5(arg3,%r11) # Write to the Ciphertext buffer
- vmovdqu \XMM7, 16*6(arg3,%r11) # Write to the Ciphertext buffer
- vmovdqu \XMM8, 16*7(arg3,%r11) # Write to the Ciphertext buffer
- .endif
-
- #######################################################################
- #second phase of the reduction
- vpclmulqdq $0x00, \T7, \T3, \T2
- vpsrldq $4, \T2, \T2 # shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
-
- vpclmulqdq $0x10, \T7, \T3, \T4
- vpslldq $4, \T4, \T4 # shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts)
-
- vpxor \T2, \T4, \T4 # second phase of the reduction complete
- #######################################################################
- vpxor \T4, \T1, \T1 # the result is in T1
-
- vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
- vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
-
-
- vpxor \T1, \XMM1, \XMM1
-
-
-
-.endm
-
-
-# GHASH the last 4 ciphertext blocks.
-.macro GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
-
- ## Karatsuba Method
-
- vmovdqu HashKey_8(arg2), \T5
-
- vpshufd $0b01001110, \XMM1, \T2
- vpshufd $0b01001110, \T5, \T3
- vpxor \XMM1, \T2, \T2
- vpxor \T5, \T3, \T3
-
- vpclmulqdq $0x11, \T5, \XMM1, \T6
- vpclmulqdq $0x00, \T5, \XMM1, \T7
-
- vpclmulqdq $0x00, \T3, \T2, \XMM1
-
- ######################
-
- vmovdqu HashKey_7(arg2), \T5
- vpshufd $0b01001110, \XMM2, \T2
- vpshufd $0b01001110, \T5, \T3
- vpxor \XMM2, \T2, \T2
- vpxor \T5, \T3, \T3
-
- vpclmulqdq $0x11, \T5, \XMM2, \T4
- vpxor \T4, \T6, \T6
-
- vpclmulqdq $0x00, \T5, \XMM2, \T4
- vpxor \T4, \T7, \T7
-
- vpclmulqdq $0x00, \T3, \T2, \T2
-
- vpxor \T2, \XMM1, \XMM1
-
- ######################
-
- vmovdqu HashKey_6(arg2), \T5
- vpshufd $0b01001110, \XMM3, \T2
- vpshufd $0b01001110, \T5, \T3
- vpxor \XMM3, \T2, \T2
- vpxor \T5, \T3, \T3
-
- vpclmulqdq $0x11, \T5, \XMM3, \T4
- vpxor \T4, \T6, \T6
-
- vpclmulqdq $0x00, \T5, \XMM3, \T4
- vpxor \T4, \T7, \T7
-
- vpclmulqdq $0x00, \T3, \T2, \T2
-
- vpxor \T2, \XMM1, \XMM1
-
- ######################
-
- vmovdqu HashKey_5(arg2), \T5
- vpshufd $0b01001110, \XMM4, \T2
- vpshufd $0b01001110, \T5, \T3
- vpxor \XMM4, \T2, \T2
- vpxor \T5, \T3, \T3
-
- vpclmulqdq $0x11, \T5, \XMM4, \T4
- vpxor \T4, \T6, \T6
-
- vpclmulqdq $0x00, \T5, \XMM4, \T4
- vpxor \T4, \T7, \T7
-
- vpclmulqdq $0x00, \T3, \T2, \T2
-
- vpxor \T2, \XMM1, \XMM1
-
- ######################
-
- vmovdqu HashKey_4(arg2), \T5
- vpshufd $0b01001110, \XMM5, \T2
- vpshufd $0b01001110, \T5, \T3
- vpxor \XMM5, \T2, \T2
- vpxor \T5, \T3, \T3
-
- vpclmulqdq $0x11, \T5, \XMM5, \T4
- vpxor \T4, \T6, \T6
-
- vpclmulqdq $0x00, \T5, \XMM5, \T4
- vpxor \T4, \T7, \T7
-
- vpclmulqdq $0x00, \T3, \T2, \T2
-
- vpxor \T2, \XMM1, \XMM1
-
- ######################
-
- vmovdqu HashKey_3(arg2), \T5
- vpshufd $0b01001110, \XMM6, \T2
- vpshufd $0b01001110, \T5, \T3
- vpxor \XMM6, \T2, \T2
- vpxor \T5, \T3, \T3
-
- vpclmulqdq $0x11, \T5, \XMM6, \T4
- vpxor \T4, \T6, \T6
-
- vpclmulqdq $0x00, \T5, \XMM6, \T4
- vpxor \T4, \T7, \T7
-
- vpclmulqdq $0x00, \T3, \T2, \T2
-
- vpxor \T2, \XMM1, \XMM1
-
- ######################
-
- vmovdqu HashKey_2(arg2), \T5
- vpshufd $0b01001110, \XMM7, \T2
- vpshufd $0b01001110, \T5, \T3
- vpxor \XMM7, \T2, \T2
- vpxor \T5, \T3, \T3
-
- vpclmulqdq $0x11, \T5, \XMM7, \T4
- vpxor \T4, \T6, \T6
-
- vpclmulqdq $0x00, \T5, \XMM7, \T4
- vpxor \T4, \T7, \T7
-
- vpclmulqdq $0x00, \T3, \T2, \T2
-
- vpxor \T2, \XMM1, \XMM1
-
- ######################
-
- vmovdqu HashKey(arg2), \T5
- vpshufd $0b01001110, \XMM8, \T2
- vpshufd $0b01001110, \T5, \T3
- vpxor \XMM8, \T2, \T2
- vpxor \T5, \T3, \T3
-
- vpclmulqdq $0x11, \T5, \XMM8, \T4
- vpxor \T4, \T6, \T6
-
- vpclmulqdq $0x00, \T5, \XMM8, \T4
- vpxor \T4, \T7, \T7
-
- vpclmulqdq $0x00, \T3, \T2, \T2
-
- vpxor \T2, \XMM1, \XMM1
- vpxor \T6, \XMM1, \XMM1
- vpxor \T7, \XMM1, \T2
-
-
-
-
- vpslldq $8, \T2, \T4
- vpsrldq $8, \T2, \T2
-
- vpxor \T4, \T7, \T7
- vpxor \T2, \T6, \T6 # <T6:T7> holds the result of the
- # accumulated carry-less multiplications
-
- #######################################################################
- #first phase of the reduction
- vmovdqa POLY2(%rip), \T3
-
- vpclmulqdq $0x01, \T7, \T3, \T2
- vpslldq $8, \T2, \T2 # shift-L xmm2 2 DWs
-
- vpxor \T2, \T7, \T7 # first phase of the reduction complete
- #######################################################################
-
-
- #second phase of the reduction
- vpclmulqdq $0x00, \T7, \T3, \T2
- vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
-
- vpclmulqdq $0x10, \T7, \T3, \T4
- vpslldq $4, \T4, \T4 # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
-
- vpxor \T2, \T4, \T4 # second phase of the reduction complete
- #######################################################################
- vpxor \T4, \T6, \T6 # the result is in T6
-.endm
-
-
-
-#############################################################
-#void aesni_gcm_init_avx_gen4
-# (gcm_data *my_ctx_data,
-# gcm_context_data *data,
-# u8 *iv, /* Pre-counter block j0: 4 byte salt
-# (from Security Association) concatenated with 8 byte
-# Initialisation Vector (from IPSec ESP Payload)
-# concatenated with 0x00000001. 16-byte aligned pointer. */
-# u8 *hash_subkey# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
-# const u8 *aad, /* Additional Authentication Data (AAD)*/
-# u64 aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
-#############################################################
-SYM_FUNC_START(aesni_gcm_init_avx_gen4)
- FUNC_SAVE
- INIT GHASH_MUL_AVX2, PRECOMPUTE_AVX2
- FUNC_RESTORE
- RET
-SYM_FUNC_END(aesni_gcm_init_avx_gen4)
-
-###############################################################################
-#void aesni_gcm_enc_avx_gen4(
-# gcm_data *my_ctx_data, /* aligned to 16 Bytes */
-# gcm_context_data *data,
-# u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
-# const u8 *in, /* Plaintext input */
-# u64 plaintext_len) /* Length of data in Bytes for encryption. */
-###############################################################################
-SYM_FUNC_START(aesni_gcm_enc_update_avx_gen4)
- FUNC_SAVE
- mov keysize,%eax
- cmp $32, %eax
- je key_256_enc_update4
- cmp $16, %eax
- je key_128_enc_update4
- # must be 192
- GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 11
- FUNC_RESTORE
- RET
-key_128_enc_update4:
- GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 9
- FUNC_RESTORE
- RET
-key_256_enc_update4:
- GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 13
- FUNC_RESTORE
- RET
-SYM_FUNC_END(aesni_gcm_enc_update_avx_gen4)
-
-###############################################################################
-#void aesni_gcm_dec_update_avx_gen4(
-# gcm_data *my_ctx_data, /* aligned to 16 Bytes */
-# gcm_context_data *data,
-# u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
-# const u8 *in, /* Ciphertext input */
-# u64 plaintext_len) /* Length of data in Bytes for encryption. */
-###############################################################################
-SYM_FUNC_START(aesni_gcm_dec_update_avx_gen4)
- FUNC_SAVE
- mov keysize,%eax
- cmp $32, %eax
- je key_256_dec_update4
- cmp $16, %eax
- je key_128_dec_update4
- # must be 192
- GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 11
- FUNC_RESTORE
- RET
-key_128_dec_update4:
- GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 9
- FUNC_RESTORE
- RET
-key_256_dec_update4:
- GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 13
- FUNC_RESTORE
- RET
-SYM_FUNC_END(aesni_gcm_dec_update_avx_gen4)
-
-###############################################################################
-#void aesni_gcm_finalize_avx_gen4(
-# gcm_data *my_ctx_data, /* aligned to 16 Bytes */
-# gcm_context_data *data,
-# u8 *auth_tag, /* Authenticated Tag output. */
-# u64 auth_tag_len)# /* Authenticated Tag Length in bytes.
-# Valid values are 16 (most likely), 12 or 8. */
-###############################################################################
-SYM_FUNC_START(aesni_gcm_finalize_avx_gen4)
- FUNC_SAVE
- mov keysize,%eax
- cmp $32, %eax
- je key_256_finalize4
- cmp $16, %eax
- je key_128_finalize4
- # must be 192
- GCM_COMPLETE GHASH_MUL_AVX2, 11, arg3, arg4
- FUNC_RESTORE
- RET
-key_128_finalize4:
- GCM_COMPLETE GHASH_MUL_AVX2, 9, arg3, arg4
- FUNC_RESTORE
- RET
-key_256_finalize4:
- GCM_COMPLETE GHASH_MUL_AVX2, 13, arg3, arg4
- FUNC_RESTORE
- RET
-SYM_FUNC_END(aesni_gcm_finalize_avx_gen4)
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index b1d90c25975a..b0dd83555499 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * Support for Intel AES-NI instructions. This file contains glue
- * code, the real AES implementation is in intel-aes_asm.S.
+ * Support for AES-NI and VAES instructions. This file contains glue code.
+ * The real AES implementations are in aesni-intel_asm.S and other .S files.
*
* Copyright (C) 2008, Intel Corp.
* Author: Huang Ying <ying.huang@intel.com>
@@ -13,6 +13,8 @@
* Tadeusz Struk (tadeusz.struk@intel.com)
* Aidan O'Mahony (aidan.o.mahony@intel.com)
* Copyright (c) 2010, Intel Corporation.
+ *
+ * Copyright 2024 Google LLC
*/
#include <linux/hardirq.h>
@@ -40,46 +42,15 @@
#define AESNI_ALIGN 16
#define AESNI_ALIGN_ATTR __attribute__ ((__aligned__(AESNI_ALIGN)))
#define AES_BLOCK_MASK (~(AES_BLOCK_SIZE - 1))
-#define RFC4106_HASH_SUBKEY_SIZE 16
#define AESNI_ALIGN_EXTRA ((AESNI_ALIGN - 1) & ~(CRYPTO_MINALIGN - 1))
#define CRYPTO_AES_CTX_SIZE (sizeof(struct crypto_aes_ctx) + AESNI_ALIGN_EXTRA)
#define XTS_AES_CTX_SIZE (sizeof(struct aesni_xts_ctx) + AESNI_ALIGN_EXTRA)
-/* This data is stored at the end of the crypto_tfm struct.
- * It's a type of per "session" data storage location.
- * This needs to be 16 byte aligned.
- */
-struct aesni_rfc4106_gcm_ctx {
- u8 hash_subkey[16] AESNI_ALIGN_ATTR;
- struct crypto_aes_ctx aes_key_expanded AESNI_ALIGN_ATTR;
- u8 nonce[4];
-};
-
-struct generic_gcmaes_ctx {
- u8 hash_subkey[16] AESNI_ALIGN_ATTR;
- struct crypto_aes_ctx aes_key_expanded AESNI_ALIGN_ATTR;
-};
-
struct aesni_xts_ctx {
struct crypto_aes_ctx tweak_ctx AESNI_ALIGN_ATTR;
struct crypto_aes_ctx crypt_ctx AESNI_ALIGN_ATTR;
};
-#define GCM_BLOCK_LEN 16
-
-struct gcm_context_data {
- /* init, update and finalize context data */
- u8 aad_hash[GCM_BLOCK_LEN];
- u64 aad_length;
- u64 in_length;
- u8 partial_block_enc_key[GCM_BLOCK_LEN];
- u8 orig_IV[GCM_BLOCK_LEN];
- u8 current_counter[GCM_BLOCK_LEN];
- u64 partial_block_len;
- u64 unused;
- u8 hash_keys[GCM_BLOCK_LEN * 16];
-};
-
static inline void *aes_align_addr(void *addr)
{
if (crypto_tfm_ctx_alignment() >= AESNI_ALIGN)
@@ -87,8 +58,8 @@ static inline void *aes_align_addr(void *addr)
return PTR_ALIGN(addr, AESNI_ALIGN);
}
-asmlinkage int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
- unsigned int key_len);
+asmlinkage void aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
+ unsigned int key_len);
asmlinkage void aesni_enc(const void *ctx, u8 *out, const u8 *in);
asmlinkage void aesni_dec(const void *ctx, u8 *out, const u8 *in);
asmlinkage void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *out,
@@ -104,14 +75,11 @@ asmlinkage void aesni_cts_cbc_enc(struct crypto_aes_ctx *ctx, u8 *out,
asmlinkage void aesni_cts_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out,
const u8 *in, unsigned int len, u8 *iv);
-#define AVX_GEN2_OPTSIZE 640
-#define AVX_GEN4_OPTSIZE 4096
-
-asmlinkage void aesni_xts_encrypt(const struct crypto_aes_ctx *ctx, u8 *out,
- const u8 *in, unsigned int len, u8 *iv);
+asmlinkage void aesni_xts_enc(const struct crypto_aes_ctx *ctx, u8 *out,
+ const u8 *in, unsigned int len, u8 *iv);
-asmlinkage void aesni_xts_decrypt(const struct crypto_aes_ctx *ctx, u8 *out,
- const u8 *in, unsigned int len, u8 *iv);
+asmlinkage void aesni_xts_dec(const struct crypto_aes_ctx *ctx, u8 *out,
+ const u8 *in, unsigned int len, u8 *iv);
#ifdef CONFIG_X86_64
@@ -119,23 +87,6 @@ asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out,
const u8 *in, unsigned int len, u8 *iv);
DEFINE_STATIC_CALL(aesni_ctr_enc_tfm, aesni_ctr_enc);
-/* Scatter / Gather routines, with args similar to above */
-asmlinkage void aesni_gcm_init(void *ctx,
- struct gcm_context_data *gdata,
- u8 *iv,
- u8 *hash_subkey, const u8 *aad,
- unsigned long aad_len);
-asmlinkage void aesni_gcm_enc_update(void *ctx,
- struct gcm_context_data *gdata, u8 *out,
- const u8 *in, unsigned long plaintext_len);
-asmlinkage void aesni_gcm_dec_update(void *ctx,
- struct gcm_context_data *gdata, u8 *out,
- const u8 *in,
- unsigned long ciphertext_len);
-asmlinkage void aesni_gcm_finalize(void *ctx,
- struct gcm_context_data *gdata,
- u8 *auth_tag, unsigned long auth_tag_len);
-
asmlinkage void aes_ctr_enc_128_avx_by8(const u8 *in, u8 *iv,
void *keys, u8 *out, unsigned int num_bytes);
asmlinkage void aes_ctr_enc_192_avx_by8(const u8 *in, u8 *iv,
@@ -155,67 +106,6 @@ asmlinkage void aes_xctr_enc_192_avx_by8(const u8 *in, const u8 *iv,
asmlinkage void aes_xctr_enc_256_avx_by8(const u8 *in, const u8 *iv,
const void *keys, u8 *out, unsigned int num_bytes,
unsigned int byte_ctr);
-
-/*
- * asmlinkage void aesni_gcm_init_avx_gen2()
- * gcm_data *my_ctx_data, context data
- * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary.
- */
-asmlinkage void aesni_gcm_init_avx_gen2(void *my_ctx_data,
- struct gcm_context_data *gdata,
- u8 *iv,
- u8 *hash_subkey,
- const u8 *aad,
- unsigned long aad_len);
-
-asmlinkage void aesni_gcm_enc_update_avx_gen2(void *ctx,
- struct gcm_context_data *gdata, u8 *out,
- const u8 *in, unsigned long plaintext_len);
-asmlinkage void aesni_gcm_dec_update_avx_gen2(void *ctx,
- struct gcm_context_data *gdata, u8 *out,
- const u8 *in,
- unsigned long ciphertext_len);
-asmlinkage void aesni_gcm_finalize_avx_gen2(void *ctx,
- struct gcm_context_data *gdata,
- u8 *auth_tag, unsigned long auth_tag_len);
-
-/*
- * asmlinkage void aesni_gcm_init_avx_gen4()
- * gcm_data *my_ctx_data, context data
- * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary.
- */
-asmlinkage void aesni_gcm_init_avx_gen4(void *my_ctx_data,
- struct gcm_context_data *gdata,
- u8 *iv,
- u8 *hash_subkey,
- const u8 *aad,
- unsigned long aad_len);
-
-asmlinkage void aesni_gcm_enc_update_avx_gen4(void *ctx,
- struct gcm_context_data *gdata, u8 *out,
- const u8 *in, unsigned long plaintext_len);
-asmlinkage void aesni_gcm_dec_update_avx_gen4(void *ctx,
- struct gcm_context_data *gdata, u8 *out,
- const u8 *in,
- unsigned long ciphertext_len);
-asmlinkage void aesni_gcm_finalize_avx_gen4(void *ctx,
- struct gcm_context_data *gdata,
- u8 *auth_tag, unsigned long auth_tag_len);
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(gcm_use_avx);
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(gcm_use_avx2);
-
-static inline struct
-aesni_rfc4106_gcm_ctx *aesni_rfc4106_gcm_ctx_get(struct crypto_aead *tfm)
-{
- return aes_align_addr(crypto_aead_ctx(tfm));
-}
-
-static inline struct
-generic_gcmaes_ctx *generic_gcmaes_ctx_get(struct crypto_aead *tfm)
-{
- return aes_align_addr(crypto_aead_ctx(tfm));
-}
#endif
static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx)
@@ -233,19 +123,17 @@ static int aes_set_key_common(struct crypto_aes_ctx *ctx,
{
int err;
- if (key_len != AES_KEYSIZE_128 && key_len != AES_KEYSIZE_192 &&
- key_len != AES_KEYSIZE_256)
- return -EINVAL;
-
if (!crypto_simd_usable())
- err = aes_expandkey(ctx, in_key, key_len);
- else {
- kernel_fpu_begin();
- err = aesni_set_key(ctx, in_key, key_len);
- kernel_fpu_end();
- }
+ return aes_expandkey(ctx, in_key, key_len);
- return err;
+ err = aes_check_keylen(key_len);
+ if (err)
+ return err;
+
+ kernel_fpu_begin();
+ aesni_set_key(ctx, in_key, key_len);
+ kernel_fpu_end();
+ return 0;
}
static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key,
@@ -591,293 +479,9 @@ static int xctr_crypt(struct skcipher_request *req)
}
return err;
}
-
-static int
-rfc4106_set_hash_subkey(u8 *hash_subkey, const u8 *key, unsigned int key_len)
-{
- struct crypto_aes_ctx ctx;
- int ret;
-
- ret = aes_expandkey(&ctx, key, key_len);
- if (ret)
- return ret;
-
- /* Clear the data in the hash sub key container to zero.*/
- /* We want to cipher all zeros to create the hash sub key. */
- memset(hash_subkey, 0, RFC4106_HASH_SUBKEY_SIZE);
-
- aes_encrypt(&ctx, hash_subkey, hash_subkey);
-
- memzero_explicit(&ctx, sizeof(ctx));
- return 0;
-}
-
-static int common_rfc4106_set_key(struct crypto_aead *aead, const u8 *key,
- unsigned int key_len)
-{
- struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(aead);
-
- if (key_len < 4)
- return -EINVAL;
-
- /*Account for 4 byte nonce at the end.*/
- key_len -= 4;
-
- memcpy(ctx->nonce, key + key_len, sizeof(ctx->nonce));
-
- return aes_set_key_common(&ctx->aes_key_expanded, key, key_len) ?:
- rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len);
-}
-
-/* This is the Integrity Check Value (aka the authentication tag) length and can
- * be 8, 12 or 16 bytes long. */
-static int common_rfc4106_set_authsize(struct crypto_aead *aead,
- unsigned int authsize)
-{
- switch (authsize) {
- case 8:
- case 12:
- case 16:
- break;
- default:
- return -EINVAL;
- }
-
- return 0;
-}
-
-static int generic_gcmaes_set_authsize(struct crypto_aead *tfm,
- unsigned int authsize)
-{
- switch (authsize) {
- case 4:
- case 8:
- case 12:
- case 13:
- case 14:
- case 15:
- case 16:
- break;
- default:
- return -EINVAL;
- }
-
- return 0;
-}
-
-static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req,
- unsigned int assoclen, u8 *hash_subkey,
- u8 *iv, void *aes_ctx, u8 *auth_tag,
- unsigned long auth_tag_len)
-{
- u8 databuf[sizeof(struct gcm_context_data) + (AESNI_ALIGN - 8)] __aligned(8);
- struct gcm_context_data *data = PTR_ALIGN((void *)databuf, AESNI_ALIGN);
- unsigned long left = req->cryptlen;
- struct scatter_walk assoc_sg_walk;
- struct skcipher_walk walk;
- bool do_avx, do_avx2;
- u8 *assocmem = NULL;
- u8 *assoc;
- int err;
-
- if (!enc)
- left -= auth_tag_len;
-
- do_avx = (left >= AVX_GEN2_OPTSIZE);
- do_avx2 = (left >= AVX_GEN4_OPTSIZE);
-
- /* Linearize assoc, if not already linear */
- if (req->src->length >= assoclen && req->src->length) {
- scatterwalk_start(&assoc_sg_walk, req->src);
- assoc = scatterwalk_map(&assoc_sg_walk);
- } else {
- gfp_t flags = (req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP) ?
- GFP_KERNEL : GFP_ATOMIC;
-
- /* assoc can be any length, so must be on heap */
- assocmem = kmalloc(assoclen, flags);
- if (unlikely(!assocmem))
- return -ENOMEM;
- assoc = assocmem;
-
- scatterwalk_map_and_copy(assoc, req->src, 0, assoclen, 0);
- }
-
- kernel_fpu_begin();
- if (static_branch_likely(&gcm_use_avx2) && do_avx2)
- aesni_gcm_init_avx_gen4(aes_ctx, data, iv, hash_subkey, assoc,
- assoclen);
- else if (static_branch_likely(&gcm_use_avx) && do_avx)
- aesni_gcm_init_avx_gen2(aes_ctx, data, iv, hash_subkey, assoc,
- assoclen);
- else
- aesni_gcm_init(aes_ctx, data, iv, hash_subkey, assoc, assoclen);
- kernel_fpu_end();
-
- if (!assocmem)
- scatterwalk_unmap(assoc);
- else
- kfree(assocmem);
-
- err = enc ? skcipher_walk_aead_encrypt(&walk, req, false)
- : skcipher_walk_aead_decrypt(&walk, req, false);
-
- while (walk.nbytes > 0) {
- kernel_fpu_begin();
- if (static_branch_likely(&gcm_use_avx2) && do_avx2) {
- if (enc)
- aesni_gcm_enc_update_avx_gen4(aes_ctx, data,
- walk.dst.virt.addr,
- walk.src.virt.addr,
- walk.nbytes);
- else
- aesni_gcm_dec_update_avx_gen4(aes_ctx, data,
- walk.dst.virt.addr,
- walk.src.virt.addr,
- walk.nbytes);
- } else if (static_branch_likely(&gcm_use_avx) && do_avx) {
- if (enc)
- aesni_gcm_enc_update_avx_gen2(aes_ctx, data,
- walk.dst.virt.addr,
- walk.src.virt.addr,
- walk.nbytes);
- else
- aesni_gcm_dec_update_avx_gen2(aes_ctx, data,
- walk.dst.virt.addr,
- walk.src.virt.addr,
- walk.nbytes);
- } else if (enc) {
- aesni_gcm_enc_update(aes_ctx, data, walk.dst.virt.addr,
- walk.src.virt.addr, walk.nbytes);
- } else {
- aesni_gcm_dec_update(aes_ctx, data, walk.dst.virt.addr,
- walk.src.virt.addr, walk.nbytes);
- }
- kernel_fpu_end();
-
- err = skcipher_walk_done(&walk, 0);
- }
-
- if (err)
- return err;
-
- kernel_fpu_begin();
- if (static_branch_likely(&gcm_use_avx2) && do_avx2)
- aesni_gcm_finalize_avx_gen4(aes_ctx, data, auth_tag,
- auth_tag_len);
- else if (static_branch_likely(&gcm_use_avx) && do_avx)
- aesni_gcm_finalize_avx_gen2(aes_ctx, data, auth_tag,
- auth_tag_len);
- else
- aesni_gcm_finalize(aes_ctx, data, auth_tag, auth_tag_len);
- kernel_fpu_end();
-
- return 0;
-}
-
-static int gcmaes_encrypt(struct aead_request *req, unsigned int assoclen,
- u8 *hash_subkey, u8 *iv, void *aes_ctx)
-{
- struct crypto_aead *tfm = crypto_aead_reqtfm(req);
- unsigned long auth_tag_len = crypto_aead_authsize(tfm);
- u8 auth_tag[16];
- int err;
-
- err = gcmaes_crypt_by_sg(true, req, assoclen, hash_subkey, iv, aes_ctx,
- auth_tag, auth_tag_len);
- if (err)
- return err;
-
- scatterwalk_map_and_copy(auth_tag, req->dst,
- req->assoclen + req->cryptlen,
- auth_tag_len, 1);
- return 0;
-}
-
-static int gcmaes_decrypt(struct aead_request *req, unsigned int assoclen,
- u8 *hash_subkey, u8 *iv, void *aes_ctx)
-{
- struct crypto_aead *tfm = crypto_aead_reqtfm(req);
- unsigned long auth_tag_len = crypto_aead_authsize(tfm);
- u8 auth_tag_msg[16];
- u8 auth_tag[16];
- int err;
-
- err = gcmaes_crypt_by_sg(false, req, assoclen, hash_subkey, iv, aes_ctx,
- auth_tag, auth_tag_len);
- if (err)
- return err;
-
- /* Copy out original auth_tag */
- scatterwalk_map_and_copy(auth_tag_msg, req->src,
- req->assoclen + req->cryptlen - auth_tag_len,
- auth_tag_len, 0);
-
- /* Compare generated tag with passed in tag. */
- if (crypto_memneq(auth_tag_msg, auth_tag, auth_tag_len)) {
- memzero_explicit(auth_tag, sizeof(auth_tag));
- return -EBADMSG;
- }
- return 0;
-}
-
-static int helper_rfc4106_encrypt(struct aead_request *req)
-{
- struct crypto_aead *tfm = crypto_aead_reqtfm(req);
- struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
- void *aes_ctx = &(ctx->aes_key_expanded);
- u8 ivbuf[16 + (AESNI_ALIGN - 8)] __aligned(8);
- u8 *iv = PTR_ALIGN(&ivbuf[0], AESNI_ALIGN);
- unsigned int i;
- __be32 counter = cpu_to_be32(1);
-
- /* Assuming we are supporting rfc4106 64-bit extended */
- /* sequence numbers We need to have the AAD length equal */
- /* to 16 or 20 bytes */
- if (unlikely(req->assoclen != 16 && req->assoclen != 20))
- return -EINVAL;
-
- /* IV below built */
- for (i = 0; i < 4; i++)
- *(iv+i) = ctx->nonce[i];
- for (i = 0; i < 8; i++)
- *(iv+4+i) = req->iv[i];
- *((__be32 *)(iv+12)) = counter;
-
- return gcmaes_encrypt(req, req->assoclen - 8, ctx->hash_subkey, iv,
- aes_ctx);
-}
-
-static int helper_rfc4106_decrypt(struct aead_request *req)
-{
- __be32 counter = cpu_to_be32(1);
- struct crypto_aead *tfm = crypto_aead_reqtfm(req);
- struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
- void *aes_ctx = &(ctx->aes_key_expanded);
- u8 ivbuf[16 + (AESNI_ALIGN - 8)] __aligned(8);
- u8 *iv = PTR_ALIGN(&ivbuf[0], AESNI_ALIGN);
- unsigned int i;
-
- if (unlikely(req->assoclen != 16 && req->assoclen != 20))
- return -EINVAL;
-
- /* Assuming we are supporting rfc4106 64-bit extended */
- /* sequence numbers We need to have the AAD length */
- /* equal to 16 or 20 bytes */
-
- /* IV below built */
- for (i = 0; i < 4; i++)
- *(iv+i) = ctx->nonce[i];
- for (i = 0; i < 8; i++)
- *(iv+4+i) = req->iv[i];
- *((__be32 *)(iv+12)) = counter;
-
- return gcmaes_decrypt(req, req->assoclen - 8, ctx->hash_subkey, iv,
- aes_ctx);
-}
#endif
-static int xts_aesni_setkey(struct crypto_skcipher *tfm, const u8 *key,
+static int xts_setkey_aesni(struct crypto_skcipher *tfm, const u8 *key,
unsigned int keylen)
{
struct aesni_xts_ctx *ctx = aes_xts_ctx(tfm);
@@ -898,108 +502,149 @@ static int xts_aesni_setkey(struct crypto_skcipher *tfm, const u8 *key,
return aes_set_key_common(&ctx->tweak_ctx, key + keylen, keylen);
}
-static int xts_crypt(struct skcipher_request *req, bool encrypt)
+typedef void (*xts_encrypt_iv_func)(const struct crypto_aes_ctx *tweak_key,
+ u8 iv[AES_BLOCK_SIZE]);
+typedef void (*xts_crypt_func)(const struct crypto_aes_ctx *key,
+ const u8 *src, u8 *dst, unsigned int len,
+ u8 tweak[AES_BLOCK_SIZE]);
+
+/* This handles cases where the source and/or destination span pages. */
+static noinline int
+xts_crypt_slowpath(struct skcipher_request *req, xts_crypt_func crypt_func)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct aesni_xts_ctx *ctx = aes_xts_ctx(tfm);
+ const struct aesni_xts_ctx *ctx = aes_xts_ctx(tfm);
int tail = req->cryptlen % AES_BLOCK_SIZE;
+ struct scatterlist sg_src[2], sg_dst[2];
struct skcipher_request subreq;
struct skcipher_walk walk;
+ struct scatterlist *src, *dst;
int err;
- if (req->cryptlen < AES_BLOCK_SIZE)
- return -EINVAL;
-
- err = skcipher_walk_virt(&walk, req, false);
- if (!walk.nbytes)
- return err;
-
- if (unlikely(tail > 0 && walk.nbytes < walk.total)) {
- int blocks = DIV_ROUND_UP(req->cryptlen, AES_BLOCK_SIZE) - 2;
-
- skcipher_walk_abort(&walk);
-
+ /*
+ * If the message length isn't divisible by the AES block size, then
+ * separate off the last full block and the partial block. This ensures
+ * that they are processed in the same call to the assembly function,
+ * which is required for ciphertext stealing.
+ */
+ if (tail) {
skcipher_request_set_tfm(&subreq, tfm);
skcipher_request_set_callback(&subreq,
skcipher_request_flags(req),
NULL, NULL);
skcipher_request_set_crypt(&subreq, req->src, req->dst,
- blocks * AES_BLOCK_SIZE, req->iv);
+ req->cryptlen - tail - AES_BLOCK_SIZE,
+ req->iv);
req = &subreq;
-
- err = skcipher_walk_virt(&walk, req, false);
- if (!walk.nbytes)
- return err;
- } else {
- tail = 0;
}
- kernel_fpu_begin();
+ err = skcipher_walk_virt(&walk, req, false);
- /* calculate first value of T */
- aesni_enc(&ctx->tweak_ctx, walk.iv, walk.iv);
+ while (walk.nbytes) {
+ kernel_fpu_begin();
+ (*crypt_func)(&ctx->crypt_ctx,
+ walk.src.virt.addr, walk.dst.virt.addr,
+ walk.nbytes & ~(AES_BLOCK_SIZE - 1), req->iv);
+ kernel_fpu_end();
+ err = skcipher_walk_done(&walk,
+ walk.nbytes & (AES_BLOCK_SIZE - 1));
+ }
- while (walk.nbytes > 0) {
- int nbytes = walk.nbytes;
+ if (err || !tail)
+ return err;
- if (nbytes < walk.total)
- nbytes &= ~(AES_BLOCK_SIZE - 1);
+ /* Do ciphertext stealing with the last full block and partial block. */
- if (encrypt)
- aesni_xts_encrypt(&ctx->crypt_ctx,
- walk.dst.virt.addr, walk.src.virt.addr,
- nbytes, walk.iv);
- else
- aesni_xts_decrypt(&ctx->crypt_ctx,
- walk.dst.virt.addr, walk.src.virt.addr,
- nbytes, walk.iv);
- kernel_fpu_end();
+ dst = src = scatterwalk_ffwd(sg_src, req->src, req->cryptlen);
+ if (req->dst != req->src)
+ dst = scatterwalk_ffwd(sg_dst, req->dst, req->cryptlen);
- err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
+ skcipher_request_set_crypt(req, src, dst, AES_BLOCK_SIZE + tail,
+ req->iv);
- if (walk.nbytes > 0)
- kernel_fpu_begin();
- }
+ err = skcipher_walk_virt(&walk, req, false);
+ if (err)
+ return err;
+
+ kernel_fpu_begin();
+ (*crypt_func)(&ctx->crypt_ctx, walk.src.virt.addr, walk.dst.virt.addr,
+ walk.nbytes, req->iv);
+ kernel_fpu_end();
- if (unlikely(tail > 0 && !err)) {
- struct scatterlist sg_src[2], sg_dst[2];
- struct scatterlist *src, *dst;
+ return skcipher_walk_done(&walk, 0);
+}
- dst = src = scatterwalk_ffwd(sg_src, req->src, req->cryptlen);
- if (req->dst != req->src)
- dst = scatterwalk_ffwd(sg_dst, req->dst, req->cryptlen);
+/* __always_inline to avoid indirect call in fastpath */
+static __always_inline int
+xts_crypt(struct skcipher_request *req, xts_encrypt_iv_func encrypt_iv,
+ xts_crypt_func crypt_func)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ const struct aesni_xts_ctx *ctx = aes_xts_ctx(tfm);
+ const unsigned int cryptlen = req->cryptlen;
+ struct scatterlist *src = req->src;
+ struct scatterlist *dst = req->dst;
- skcipher_request_set_crypt(req, src, dst, AES_BLOCK_SIZE + tail,
- req->iv);
+ if (unlikely(cryptlen < AES_BLOCK_SIZE))
+ return -EINVAL;
- err = skcipher_walk_virt(&walk, &subreq, false);
- if (err)
- return err;
+ kernel_fpu_begin();
+ (*encrypt_iv)(&ctx->tweak_ctx, req->iv);
- kernel_fpu_begin();
- if (encrypt)
- aesni_xts_encrypt(&ctx->crypt_ctx,
- walk.dst.virt.addr, walk.src.virt.addr,
- walk.nbytes, walk.iv);
- else
- aesni_xts_decrypt(&ctx->crypt_ctx,
- walk.dst.virt.addr, walk.src.virt.addr,
- walk.nbytes, walk.iv);
+ /*
+ * In practice, virtually all XTS plaintexts and ciphertexts are either
+ * 512 or 4096 bytes, aligned such that they don't span page boundaries.
+ * To optimize the performance of these cases, and also any other case
+ * where no page boundary is spanned, the below fast-path handles
+ * single-page sources and destinations as efficiently as possible.
+ */
+ if (likely(src->length >= cryptlen && dst->length >= cryptlen &&
+ src->offset + cryptlen <= PAGE_SIZE &&
+ dst->offset + cryptlen <= PAGE_SIZE)) {
+ struct page *src_page = sg_page(src);
+ struct page *dst_page = sg_page(dst);
+ void *src_virt = kmap_local_page(src_page) + src->offset;
+ void *dst_virt = kmap_local_page(dst_page) + dst->offset;
+
+ (*crypt_func)(&ctx->crypt_ctx, src_virt, dst_virt, cryptlen,
+ req->iv);
+ kunmap_local(dst_virt);
+ kunmap_local(src_virt);
kernel_fpu_end();
-
- err = skcipher_walk_done(&walk, 0);
+ return 0;
}
- return err;
+ kernel_fpu_end();
+ return xts_crypt_slowpath(req, crypt_func);
}
-static int xts_encrypt(struct skcipher_request *req)
+static void aesni_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key,
+ u8 iv[AES_BLOCK_SIZE])
{
- return xts_crypt(req, true);
+ aesni_enc(tweak_key, iv, iv);
}
-static int xts_decrypt(struct skcipher_request *req)
+static void aesni_xts_encrypt(const struct crypto_aes_ctx *key,
+ const u8 *src, u8 *dst, unsigned int len,
+ u8 tweak[AES_BLOCK_SIZE])
{
- return xts_crypt(req, false);
+ aesni_xts_enc(key, dst, src, len, tweak);
+}
+
+static void aesni_xts_decrypt(const struct crypto_aes_ctx *key,
+ const u8 *src, u8 *dst, unsigned int len,
+ u8 tweak[AES_BLOCK_SIZE])
+{
+ aesni_xts_dec(key, dst, src, len, tweak);
+}
+
+static int xts_encrypt_aesni(struct skcipher_request *req)
+{
+ return xts_crypt(req, aesni_xts_encrypt_iv, aesni_xts_encrypt);
+}
+
+static int xts_decrypt_aesni(struct skcipher_request *req)
+{
+ return xts_crypt(req, aesni_xts_encrypt_iv, aesni_xts_decrypt);
}
static struct crypto_alg aesni_cipher_alg = {
@@ -1103,9 +748,9 @@ static struct skcipher_alg aesni_skciphers[] = {
.max_keysize = 2 * AES_MAX_KEY_SIZE,
.ivsize = AES_BLOCK_SIZE,
.walksize = 2 * AES_BLOCK_SIZE,
- .setkey = xts_aesni_setkey,
- .encrypt = xts_encrypt,
- .decrypt = xts_decrypt,
+ .setkey = xts_setkey_aesni,
+ .encrypt = xts_encrypt_aesni,
+ .decrypt = xts_decrypt_aesni,
}
};
@@ -1137,90 +782,887 @@ static struct skcipher_alg aesni_xctr = {
};
static struct simd_skcipher_alg *aesni_simd_xctr;
-#endif /* CONFIG_X86_64 */
-#ifdef CONFIG_X86_64
-static int generic_gcmaes_set_key(struct crypto_aead *aead, const u8 *key,
- unsigned int key_len)
+asmlinkage void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key,
+ u8 iv[AES_BLOCK_SIZE]);
+
+#define DEFINE_XTS_ALG(suffix, driver_name, priority) \
+ \
+asmlinkage void \
+aes_xts_encrypt_##suffix(const struct crypto_aes_ctx *key, const u8 *src, \
+ u8 *dst, unsigned int len, u8 tweak[AES_BLOCK_SIZE]); \
+asmlinkage void \
+aes_xts_decrypt_##suffix(const struct crypto_aes_ctx *key, const u8 *src, \
+ u8 *dst, unsigned int len, u8 tweak[AES_BLOCK_SIZE]); \
+ \
+static int xts_encrypt_##suffix(struct skcipher_request *req) \
+{ \
+ return xts_crypt(req, aes_xts_encrypt_iv, aes_xts_encrypt_##suffix); \
+} \
+ \
+static int xts_decrypt_##suffix(struct skcipher_request *req) \
+{ \
+ return xts_crypt(req, aes_xts_encrypt_iv, aes_xts_decrypt_##suffix); \
+} \
+ \
+static struct skcipher_alg aes_xts_alg_##suffix = { \
+ .base = { \
+ .cra_name = "__xts(aes)", \
+ .cra_driver_name = "__" driver_name, \
+ .cra_priority = priority, \
+ .cra_flags = CRYPTO_ALG_INTERNAL, \
+ .cra_blocksize = AES_BLOCK_SIZE, \
+ .cra_ctxsize = XTS_AES_CTX_SIZE, \
+ .cra_module = THIS_MODULE, \
+ }, \
+ .min_keysize = 2 * AES_MIN_KEY_SIZE, \
+ .max_keysize = 2 * AES_MAX_KEY_SIZE, \
+ .ivsize = AES_BLOCK_SIZE, \
+ .walksize = 2 * AES_BLOCK_SIZE, \
+ .setkey = xts_setkey_aesni, \
+ .encrypt = xts_encrypt_##suffix, \
+ .decrypt = xts_decrypt_##suffix, \
+}; \
+ \
+static struct simd_skcipher_alg *aes_xts_simdalg_##suffix
+
+DEFINE_XTS_ALG(aesni_avx, "xts-aes-aesni-avx", 500);
+#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ)
+DEFINE_XTS_ALG(vaes_avx2, "xts-aes-vaes-avx2", 600);
+DEFINE_XTS_ALG(vaes_avx10_256, "xts-aes-vaes-avx10_256", 700);
+DEFINE_XTS_ALG(vaes_avx10_512, "xts-aes-vaes-avx10_512", 800);
+#endif
+
+/* The common part of the x86_64 AES-GCM key struct */
+struct aes_gcm_key {
+ /* Expanded AES key and the AES key length in bytes */
+ struct crypto_aes_ctx aes_key;
+
+ /* RFC4106 nonce (used only by the rfc4106 algorithms) */
+ u32 rfc4106_nonce;
+};
+
+/* Key struct used by the AES-NI implementations of AES-GCM */
+struct aes_gcm_key_aesni {
+ /*
+ * Common part of the key. The assembly code requires 16-byte alignment
+ * for the round keys; we get this by them being located at the start of
+ * the struct and the whole struct being 16-byte aligned.
+ */
+ struct aes_gcm_key base;
+
+ /*
+ * Powers of the hash key H^8 through H^1. These are 128-bit values.
+ * They all have an extra factor of x^-1 and are byte-reversed. 16-byte
+ * alignment is required by the assembly code.
+ */
+ u64 h_powers[8][2] __aligned(16);
+
+ /*
+ * h_powers_xored[i] contains the two 64-bit halves of h_powers[i] XOR'd
+ * together. It's used for Karatsuba multiplication. 16-byte alignment
+ * is required by the assembly code.
+ */
+ u64 h_powers_xored[8] __aligned(16);
+
+ /*
+ * H^1 times x^64 (and also the usual extra factor of x^-1). 16-byte
+ * alignment is required by the assembly code.
+ */
+ u64 h_times_x64[2] __aligned(16);
+};
+#define AES_GCM_KEY_AESNI(key) \
+ container_of((key), struct aes_gcm_key_aesni, base)
+#define AES_GCM_KEY_AESNI_SIZE \
+ (sizeof(struct aes_gcm_key_aesni) + (15 & ~(CRYPTO_MINALIGN - 1)))
+
+/* Key struct used by the VAES + AVX10 implementations of AES-GCM */
+struct aes_gcm_key_avx10 {
+ /*
+ * Common part of the key. The assembly code prefers 16-byte alignment
+ * for the round keys; we get this by them being located at the start of
+ * the struct and the whole struct being 64-byte aligned.
+ */
+ struct aes_gcm_key base;
+
+ /*
+ * Powers of the hash key H^16 through H^1. These are 128-bit values.
+ * They all have an extra factor of x^-1 and are byte-reversed. This
+ * array is aligned to a 64-byte boundary to make it naturally aligned
+ * for 512-bit loads, which can improve performance. (The assembly code
+ * doesn't *need* the alignment; this is just an optimization.)
+ */
+ u64 h_powers[16][2] __aligned(64);
+
+ /* Three padding blocks required by the assembly code */
+ u64 padding[3][2];
+};
+#define AES_GCM_KEY_AVX10(key) \
+ container_of((key), struct aes_gcm_key_avx10, base)
+#define AES_GCM_KEY_AVX10_SIZE \
+ (sizeof(struct aes_gcm_key_avx10) + (63 & ~(CRYPTO_MINALIGN - 1)))
+
+/*
+ * These flags are passed to the AES-GCM helper functions to specify the
+ * specific version of AES-GCM (RFC4106 or not), whether it's encryption or
+ * decryption, and which assembly functions should be called. Assembly
+ * functions are selected using flags instead of function pointers to avoid
+ * indirect calls (which are very expensive on x86) regardless of inlining.
+ */
+#define FLAG_RFC4106 BIT(0)
+#define FLAG_ENC BIT(1)
+#define FLAG_AVX BIT(2)
+#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ)
+# define FLAG_AVX10_256 BIT(3)
+# define FLAG_AVX10_512 BIT(4)
+#else
+ /*
+ * This should cause all calls to the AVX10 assembly functions to be
+ * optimized out, avoiding the need to ifdef each call individually.
+ */
+# define FLAG_AVX10_256 0
+# define FLAG_AVX10_512 0
+#endif
+
+static inline struct aes_gcm_key *
+aes_gcm_key_get(struct crypto_aead *tfm, int flags)
{
- struct generic_gcmaes_ctx *ctx = generic_gcmaes_ctx_get(aead);
+ if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512))
+ return PTR_ALIGN(crypto_aead_ctx(tfm), 64);
+ else
+ return PTR_ALIGN(crypto_aead_ctx(tfm), 16);
+}
+
+asmlinkage void
+aes_gcm_precompute_aesni(struct aes_gcm_key_aesni *key);
+asmlinkage void
+aes_gcm_precompute_aesni_avx(struct aes_gcm_key_aesni *key);
+asmlinkage void
+aes_gcm_precompute_vaes_avx10_256(struct aes_gcm_key_avx10 *key);
+asmlinkage void
+aes_gcm_precompute_vaes_avx10_512(struct aes_gcm_key_avx10 *key);
- return aes_set_key_common(&ctx->aes_key_expanded, key, key_len) ?:
- rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len);
+static void aes_gcm_precompute(struct aes_gcm_key *key, int flags)
+{
+ /*
+ * To make things a bit easier on the assembly side, the AVX10
+ * implementations use the same key format. Therefore, a single
+ * function using 256-bit vectors would suffice here. However, it's
+ * straightforward to provide a 512-bit one because of how the assembly
+ * code is structured, and it works nicely because the total size of the
+ * key powers is a multiple of 512 bits. So we take advantage of that.
+ *
+ * A similar situation applies to the AES-NI implementations.
+ */
+ if (flags & FLAG_AVX10_512)
+ aes_gcm_precompute_vaes_avx10_512(AES_GCM_KEY_AVX10(key));
+ else if (flags & FLAG_AVX10_256)
+ aes_gcm_precompute_vaes_avx10_256(AES_GCM_KEY_AVX10(key));
+ else if (flags & FLAG_AVX)
+ aes_gcm_precompute_aesni_avx(AES_GCM_KEY_AESNI(key));
+ else
+ aes_gcm_precompute_aesni(AES_GCM_KEY_AESNI(key));
}
-static int generic_gcmaes_encrypt(struct aead_request *req)
+asmlinkage void
+aes_gcm_aad_update_aesni(const struct aes_gcm_key_aesni *key,
+ u8 ghash_acc[16], const u8 *aad, int aadlen);
+asmlinkage void
+aes_gcm_aad_update_aesni_avx(const struct aes_gcm_key_aesni *key,
+ u8 ghash_acc[16], const u8 *aad, int aadlen);
+asmlinkage void
+aes_gcm_aad_update_vaes_avx10(const struct aes_gcm_key_avx10 *key,
+ u8 ghash_acc[16], const u8 *aad, int aadlen);
+
+static void aes_gcm_aad_update(const struct aes_gcm_key *key, u8 ghash_acc[16],
+ const u8 *aad, int aadlen, int flags)
{
- struct crypto_aead *tfm = crypto_aead_reqtfm(req);
- struct generic_gcmaes_ctx *ctx = generic_gcmaes_ctx_get(tfm);
- void *aes_ctx = &(ctx->aes_key_expanded);
- u8 ivbuf[16 + (AESNI_ALIGN - 8)] __aligned(8);
- u8 *iv = PTR_ALIGN(&ivbuf[0], AESNI_ALIGN);
- __be32 counter = cpu_to_be32(1);
+ if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512))
+ aes_gcm_aad_update_vaes_avx10(AES_GCM_KEY_AVX10(key), ghash_acc,
+ aad, aadlen);
+ else if (flags & FLAG_AVX)
+ aes_gcm_aad_update_aesni_avx(AES_GCM_KEY_AESNI(key), ghash_acc,
+ aad, aadlen);
+ else
+ aes_gcm_aad_update_aesni(AES_GCM_KEY_AESNI(key), ghash_acc,
+ aad, aadlen);
+}
+
+asmlinkage void
+aes_gcm_enc_update_aesni(const struct aes_gcm_key_aesni *key,
+ const u32 le_ctr[4], u8 ghash_acc[16],
+ const u8 *src, u8 *dst, int datalen);
+asmlinkage void
+aes_gcm_enc_update_aesni_avx(const struct aes_gcm_key_aesni *key,
+ const u32 le_ctr[4], u8 ghash_acc[16],
+ const u8 *src, u8 *dst, int datalen);
+asmlinkage void
+aes_gcm_enc_update_vaes_avx10_256(const struct aes_gcm_key_avx10 *key,
+ const u32 le_ctr[4], u8 ghash_acc[16],
+ const u8 *src, u8 *dst, int datalen);
+asmlinkage void
+aes_gcm_enc_update_vaes_avx10_512(const struct aes_gcm_key_avx10 *key,
+ const u32 le_ctr[4], u8 ghash_acc[16],
+ const u8 *src, u8 *dst, int datalen);
+
+asmlinkage void
+aes_gcm_dec_update_aesni(const struct aes_gcm_key_aesni *key,
+ const u32 le_ctr[4], u8 ghash_acc[16],
+ const u8 *src, u8 *dst, int datalen);
+asmlinkage void
+aes_gcm_dec_update_aesni_avx(const struct aes_gcm_key_aesni *key,
+ const u32 le_ctr[4], u8 ghash_acc[16],
+ const u8 *src, u8 *dst, int datalen);
+asmlinkage void
+aes_gcm_dec_update_vaes_avx10_256(const struct aes_gcm_key_avx10 *key,
+ const u32 le_ctr[4], u8 ghash_acc[16],
+ const u8 *src, u8 *dst, int datalen);
+asmlinkage void
+aes_gcm_dec_update_vaes_avx10_512(const struct aes_gcm_key_avx10 *key,
+ const u32 le_ctr[4], u8 ghash_acc[16],
+ const u8 *src, u8 *dst, int datalen);
+
+/* __always_inline to optimize out the branches based on @flags */
+static __always_inline void
+aes_gcm_update(const struct aes_gcm_key *key,
+ const u32 le_ctr[4], u8 ghash_acc[16],
+ const u8 *src, u8 *dst, int datalen, int flags)
+{
+ if (flags & FLAG_ENC) {
+ if (flags & FLAG_AVX10_512)
+ aes_gcm_enc_update_vaes_avx10_512(AES_GCM_KEY_AVX10(key),
+ le_ctr, ghash_acc,
+ src, dst, datalen);
+ else if (flags & FLAG_AVX10_256)
+ aes_gcm_enc_update_vaes_avx10_256(AES_GCM_KEY_AVX10(key),
+ le_ctr, ghash_acc,
+ src, dst, datalen);
+ else if (flags & FLAG_AVX)
+ aes_gcm_enc_update_aesni_avx(AES_GCM_KEY_AESNI(key),
+ le_ctr, ghash_acc,
+ src, dst, datalen);
+ else
+ aes_gcm_enc_update_aesni(AES_GCM_KEY_AESNI(key), le_ctr,
+ ghash_acc, src, dst, datalen);
+ } else {
+ if (flags & FLAG_AVX10_512)
+ aes_gcm_dec_update_vaes_avx10_512(AES_GCM_KEY_AVX10(key),
+ le_ctr, ghash_acc,
+ src, dst, datalen);
+ else if (flags & FLAG_AVX10_256)
+ aes_gcm_dec_update_vaes_avx10_256(AES_GCM_KEY_AVX10(key),
+ le_ctr, ghash_acc,
+ src, dst, datalen);
+ else if (flags & FLAG_AVX)
+ aes_gcm_dec_update_aesni_avx(AES_GCM_KEY_AESNI(key),
+ le_ctr, ghash_acc,
+ src, dst, datalen);
+ else
+ aes_gcm_dec_update_aesni(AES_GCM_KEY_AESNI(key),
+ le_ctr, ghash_acc,
+ src, dst, datalen);
+ }
+}
+
+asmlinkage void
+aes_gcm_enc_final_aesni(const struct aes_gcm_key_aesni *key,
+ const u32 le_ctr[4], u8 ghash_acc[16],
+ u64 total_aadlen, u64 total_datalen);
+asmlinkage void
+aes_gcm_enc_final_aesni_avx(const struct aes_gcm_key_aesni *key,
+ const u32 le_ctr[4], u8 ghash_acc[16],
+ u64 total_aadlen, u64 total_datalen);
+asmlinkage void
+aes_gcm_enc_final_vaes_avx10(const struct aes_gcm_key_avx10 *key,
+ const u32 le_ctr[4], u8 ghash_acc[16],
+ u64 total_aadlen, u64 total_datalen);
+
+/* __always_inline to optimize out the branches based on @flags */
+static __always_inline void
+aes_gcm_enc_final(const struct aes_gcm_key *key,
+ const u32 le_ctr[4], u8 ghash_acc[16],
+ u64 total_aadlen, u64 total_datalen, int flags)
+{
+ if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512))
+ aes_gcm_enc_final_vaes_avx10(AES_GCM_KEY_AVX10(key),
+ le_ctr, ghash_acc,
+ total_aadlen, total_datalen);
+ else if (flags & FLAG_AVX)
+ aes_gcm_enc_final_aesni_avx(AES_GCM_KEY_AESNI(key),
+ le_ctr, ghash_acc,
+ total_aadlen, total_datalen);
+ else
+ aes_gcm_enc_final_aesni(AES_GCM_KEY_AESNI(key),
+ le_ctr, ghash_acc,
+ total_aadlen, total_datalen);
+}
+
+asmlinkage bool __must_check
+aes_gcm_dec_final_aesni(const struct aes_gcm_key_aesni *key,
+ const u32 le_ctr[4], const u8 ghash_acc[16],
+ u64 total_aadlen, u64 total_datalen,
+ const u8 tag[16], int taglen);
+asmlinkage bool __must_check
+aes_gcm_dec_final_aesni_avx(const struct aes_gcm_key_aesni *key,
+ const u32 le_ctr[4], const u8 ghash_acc[16],
+ u64 total_aadlen, u64 total_datalen,
+ const u8 tag[16], int taglen);
+asmlinkage bool __must_check
+aes_gcm_dec_final_vaes_avx10(const struct aes_gcm_key_avx10 *key,
+ const u32 le_ctr[4], const u8 ghash_acc[16],
+ u64 total_aadlen, u64 total_datalen,
+ const u8 tag[16], int taglen);
+
+/* __always_inline to optimize out the branches based on @flags */
+static __always_inline bool __must_check
+aes_gcm_dec_final(const struct aes_gcm_key *key, const u32 le_ctr[4],
+ u8 ghash_acc[16], u64 total_aadlen, u64 total_datalen,
+ u8 tag[16], int taglen, int flags)
+{
+ if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512))
+ return aes_gcm_dec_final_vaes_avx10(AES_GCM_KEY_AVX10(key),
+ le_ctr, ghash_acc,
+ total_aadlen, total_datalen,
+ tag, taglen);
+ else if (flags & FLAG_AVX)
+ return aes_gcm_dec_final_aesni_avx(AES_GCM_KEY_AESNI(key),
+ le_ctr, ghash_acc,
+ total_aadlen, total_datalen,
+ tag, taglen);
+ else
+ return aes_gcm_dec_final_aesni(AES_GCM_KEY_AESNI(key),
+ le_ctr, ghash_acc,
+ total_aadlen, total_datalen,
+ tag, taglen);
+}
+
+/*
+ * This is the Integrity Check Value (aka the authentication tag) length and can
+ * be 8, 12 or 16 bytes long.
+ */
+static int common_rfc4106_set_authsize(struct crypto_aead *aead,
+ unsigned int authsize)
+{
+ switch (authsize) {
+ case 8:
+ case 12:
+ case 16:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int generic_gcmaes_set_authsize(struct crypto_aead *tfm,
+ unsigned int authsize)
+{
+ switch (authsize) {
+ case 4:
+ case 8:
+ case 12:
+ case 13:
+ case 14:
+ case 15:
+ case 16:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/*
+ * This is the setkey function for the x86_64 implementations of AES-GCM. It
+ * saves the RFC4106 nonce if applicable, expands the AES key, and precomputes
+ * powers of the hash key.
+ *
+ * To comply with the crypto_aead API, this has to be usable in no-SIMD context.
+ * For that reason, this function includes a portable C implementation of the
+ * needed logic. However, the portable C implementation is very slow, taking
+ * about the same time as encrypting 37 KB of data. To be ready for users that
+ * may set a key even somewhat frequently, we therefore also include a SIMD
+ * assembly implementation, expanding the AES key using AES-NI and precomputing
+ * the hash key powers using PCLMULQDQ or VPCLMULQDQ.
+ */
+static int gcm_setkey(struct crypto_aead *tfm, const u8 *raw_key,
+ unsigned int keylen, int flags)
+{
+ struct aes_gcm_key *key = aes_gcm_key_get(tfm, flags);
+ int err;
+
+ if (flags & FLAG_RFC4106) {
+ if (keylen < 4)
+ return -EINVAL;
+ keylen -= 4;
+ key->rfc4106_nonce = get_unaligned_be32(raw_key + keylen);
+ }
+
+ /* The assembly code assumes the following offsets. */
+ BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, base.aes_key.key_enc) != 0);
+ BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, base.aes_key.key_length) != 480);
+ BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, h_powers) != 496);
+ BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, h_powers_xored) != 624);
+ BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, h_times_x64) != 688);
+ BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, base.aes_key.key_enc) != 0);
+ BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, base.aes_key.key_length) != 480);
+ BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, h_powers) != 512);
+ BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, padding) != 768);
+
+ if (likely(crypto_simd_usable())) {
+ err = aes_check_keylen(keylen);
+ if (err)
+ return err;
+ kernel_fpu_begin();
+ aesni_set_key(&key->aes_key, raw_key, keylen);
+ aes_gcm_precompute(key, flags);
+ kernel_fpu_end();
+ } else {
+ static const u8 x_to_the_minus1[16] __aligned(__alignof__(be128)) = {
+ [0] = 0xc2, [15] = 1
+ };
+ static const u8 x_to_the_63[16] __aligned(__alignof__(be128)) = {
+ [7] = 1,
+ };
+ be128 h1 = {};
+ be128 h;
+ int i;
+
+ err = aes_expandkey(&key->aes_key, raw_key, keylen);
+ if (err)
+ return err;
+
+ /* Encrypt the all-zeroes block to get the hash key H^1 */
+ aes_encrypt(&key->aes_key, (u8 *)&h1, (u8 *)&h1);
- memcpy(iv, req->iv, 12);
- *((__be32 *)(iv+12)) = counter;
+ /* Compute H^1 * x^-1 */
+ h = h1;
+ gf128mul_lle(&h, (const be128 *)x_to_the_minus1);
+
+ /* Compute the needed key powers */
+ if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512)) {
+ struct aes_gcm_key_avx10 *k = AES_GCM_KEY_AVX10(key);
+
+ for (i = ARRAY_SIZE(k->h_powers) - 1; i >= 0; i--) {
+ k->h_powers[i][0] = be64_to_cpu(h.b);
+ k->h_powers[i][1] = be64_to_cpu(h.a);
+ gf128mul_lle(&h, &h1);
+ }
+ memset(k->padding, 0, sizeof(k->padding));
+ } else {
+ struct aes_gcm_key_aesni *k = AES_GCM_KEY_AESNI(key);
+
+ for (i = ARRAY_SIZE(k->h_powers) - 1; i >= 0; i--) {
+ k->h_powers[i][0] = be64_to_cpu(h.b);
+ k->h_powers[i][1] = be64_to_cpu(h.a);
+ k->h_powers_xored[i] = k->h_powers[i][0] ^
+ k->h_powers[i][1];
+ gf128mul_lle(&h, &h1);
+ }
+ gf128mul_lle(&h1, (const be128 *)x_to_the_63);
+ k->h_times_x64[0] = be64_to_cpu(h1.b);
+ k->h_times_x64[1] = be64_to_cpu(h1.a);
+ }
+ }
+ return 0;
+}
- return gcmaes_encrypt(req, req->assoclen, ctx->hash_subkey, iv,
- aes_ctx);
+/*
+ * Initialize @ghash_acc, then pass all @assoclen bytes of associated data
+ * (a.k.a. additional authenticated data) from @sg_src through the GHASH update
+ * assembly function. kernel_fpu_begin() must have already been called.
+ */
+static void gcm_process_assoc(const struct aes_gcm_key *key, u8 ghash_acc[16],
+ struct scatterlist *sg_src, unsigned int assoclen,
+ int flags)
+{
+ struct scatter_walk walk;
+ /*
+ * The assembly function requires that the length of any non-last
+ * segment of associated data be a multiple of 16 bytes, so this
+ * function does the buffering needed to achieve that.
+ */
+ unsigned int pos = 0;
+ u8 buf[16];
+
+ memset(ghash_acc, 0, 16);
+ scatterwalk_start(&walk, sg_src);
+
+ while (assoclen) {
+ unsigned int len_this_page = scatterwalk_clamp(&walk, assoclen);
+ void *mapped = scatterwalk_map(&walk);
+ const void *src = mapped;
+ unsigned int len;
+
+ assoclen -= len_this_page;
+ scatterwalk_advance(&walk, len_this_page);
+ if (unlikely(pos)) {
+ len = min(len_this_page, 16 - pos);
+ memcpy(&buf[pos], src, len);
+ pos += len;
+ src += len;
+ len_this_page -= len;
+ if (pos < 16)
+ goto next;
+ aes_gcm_aad_update(key, ghash_acc, buf, 16, flags);
+ pos = 0;
+ }
+ len = len_this_page;
+ if (unlikely(assoclen)) /* Not the last segment yet? */
+ len = round_down(len, 16);
+ aes_gcm_aad_update(key, ghash_acc, src, len, flags);
+ src += len;
+ len_this_page -= len;
+ if (unlikely(len_this_page)) {
+ memcpy(buf, src, len_this_page);
+ pos = len_this_page;
+ }
+next:
+ scatterwalk_unmap(mapped);
+ scatterwalk_pagedone(&walk, 0, assoclen);
+ if (need_resched()) {
+ kernel_fpu_end();
+ kernel_fpu_begin();
+ }
+ }
+ if (unlikely(pos))
+ aes_gcm_aad_update(key, ghash_acc, buf, pos, flags);
}
-static int generic_gcmaes_decrypt(struct aead_request *req)
+
+/* __always_inline to optimize out the branches based on @flags */
+static __always_inline int
+gcm_crypt(struct aead_request *req, int flags)
{
- __be32 counter = cpu_to_be32(1);
struct crypto_aead *tfm = crypto_aead_reqtfm(req);
- struct generic_gcmaes_ctx *ctx = generic_gcmaes_ctx_get(tfm);
- void *aes_ctx = &(ctx->aes_key_expanded);
- u8 ivbuf[16 + (AESNI_ALIGN - 8)] __aligned(8);
- u8 *iv = PTR_ALIGN(&ivbuf[0], AESNI_ALIGN);
+ const struct aes_gcm_key *key = aes_gcm_key_get(tfm, flags);
+ unsigned int assoclen = req->assoclen;
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ u8 ghash_acc[16]; /* GHASH accumulator */
+ u32 le_ctr[4]; /* Counter in little-endian format */
+ int taglen;
+ int err;
- memcpy(iv, req->iv, 12);
- *((__be32 *)(iv+12)) = counter;
+ /* Initialize the counter and determine the associated data length. */
+ le_ctr[0] = 2;
+ if (flags & FLAG_RFC4106) {
+ if (unlikely(assoclen != 16 && assoclen != 20))
+ return -EINVAL;
+ assoclen -= 8;
+ le_ctr[1] = get_unaligned_be32(req->iv + 4);
+ le_ctr[2] = get_unaligned_be32(req->iv + 0);
+ le_ctr[3] = key->rfc4106_nonce; /* already byte-swapped */
+ } else {
+ le_ctr[1] = get_unaligned_be32(req->iv + 8);
+ le_ctr[2] = get_unaligned_be32(req->iv + 4);
+ le_ctr[3] = get_unaligned_be32(req->iv + 0);
+ }
- return gcmaes_decrypt(req, req->assoclen, ctx->hash_subkey, iv,
- aes_ctx);
+ /* Begin walking through the plaintext or ciphertext. */
+ if (flags & FLAG_ENC)
+ err = skcipher_walk_aead_encrypt(&walk, req, false);
+ else
+ err = skcipher_walk_aead_decrypt(&walk, req, false);
+ if (err)
+ return err;
+
+ /*
+ * Since the AES-GCM assembly code requires that at least three assembly
+ * functions be called to process any message (this is needed to support
+ * incremental updates cleanly), to reduce overhead we try to do all
+ * three calls in the same kernel FPU section if possible. We close the
+ * section and start a new one if there are multiple data segments or if
+ * rescheduling is needed while processing the associated data.
+ */
+ kernel_fpu_begin();
+
+ /* Pass the associated data through GHASH. */
+ gcm_process_assoc(key, ghash_acc, req->src, assoclen, flags);
+
+ /* En/decrypt the data and pass the ciphertext through GHASH. */
+ while (unlikely((nbytes = walk.nbytes) < walk.total)) {
+ /*
+ * Non-last segment. In this case, the assembly function
+ * requires that the length be a multiple of 16 (AES_BLOCK_SIZE)
+ * bytes. The needed buffering of up to 16 bytes is handled by
+ * the skcipher_walk. Here we just need to round down to a
+ * multiple of 16.
+ */
+ nbytes = round_down(nbytes, AES_BLOCK_SIZE);
+ aes_gcm_update(key, le_ctr, ghash_acc, walk.src.virt.addr,
+ walk.dst.virt.addr, nbytes, flags);
+ le_ctr[0] += nbytes / AES_BLOCK_SIZE;
+ kernel_fpu_end();
+ err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
+ if (err)
+ return err;
+ kernel_fpu_begin();
+ }
+ /* Last segment: process all remaining data. */
+ aes_gcm_update(key, le_ctr, ghash_acc, walk.src.virt.addr,
+ walk.dst.virt.addr, nbytes, flags);
+ /*
+ * The low word of the counter isn't used by the finalize, so there's no
+ * need to increment it here.
+ */
+
+ /* Finalize */
+ taglen = crypto_aead_authsize(tfm);
+ if (flags & FLAG_ENC) {
+ /* Finish computing the auth tag. */
+ aes_gcm_enc_final(key, le_ctr, ghash_acc, assoclen,
+ req->cryptlen, flags);
+
+ /* Store the computed auth tag in the dst scatterlist. */
+ scatterwalk_map_and_copy(ghash_acc, req->dst, req->assoclen +
+ req->cryptlen, taglen, 1);
+ } else {
+ unsigned int datalen = req->cryptlen - taglen;
+ u8 tag[16];
+
+ /* Get the transmitted auth tag from the src scatterlist. */
+ scatterwalk_map_and_copy(tag, req->src, req->assoclen + datalen,
+ taglen, 0);
+ /*
+ * Finish computing the auth tag and compare it to the
+ * transmitted one. The assembly function does the actual tag
+ * comparison. Here, just check the boolean result.
+ */
+ if (!aes_gcm_dec_final(key, le_ctr, ghash_acc, assoclen,
+ datalen, tag, taglen, flags))
+ err = -EBADMSG;
+ }
+ kernel_fpu_end();
+ if (nbytes)
+ skcipher_walk_done(&walk, 0);
+ return err;
}
-static struct aead_alg aesni_aeads[] = { {
- .setkey = common_rfc4106_set_key,
- .setauthsize = common_rfc4106_set_authsize,
- .encrypt = helper_rfc4106_encrypt,
- .decrypt = helper_rfc4106_decrypt,
- .ivsize = GCM_RFC4106_IV_SIZE,
- .maxauthsize = 16,
- .base = {
- .cra_name = "__rfc4106(gcm(aes))",
- .cra_driver_name = "__rfc4106-gcm-aesni",
- .cra_priority = 400,
- .cra_flags = CRYPTO_ALG_INTERNAL,
- .cra_blocksize = 1,
- .cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx),
- .cra_alignmask = 0,
- .cra_module = THIS_MODULE,
- },
-}, {
- .setkey = generic_gcmaes_set_key,
- .setauthsize = generic_gcmaes_set_authsize,
- .encrypt = generic_gcmaes_encrypt,
- .decrypt = generic_gcmaes_decrypt,
- .ivsize = GCM_AES_IV_SIZE,
- .maxauthsize = 16,
- .base = {
- .cra_name = "__gcm(aes)",
- .cra_driver_name = "__generic-gcm-aesni",
- .cra_priority = 400,
- .cra_flags = CRYPTO_ALG_INTERNAL,
- .cra_blocksize = 1,
- .cra_ctxsize = sizeof(struct generic_gcmaes_ctx),
- .cra_alignmask = 0,
- .cra_module = THIS_MODULE,
- },
-} };
-#else
-static struct aead_alg aesni_aeads[0];
+#define DEFINE_GCM_ALGS(suffix, flags, generic_driver_name, rfc_driver_name, \
+ ctxsize, priority) \
+ \
+static int gcm_setkey_##suffix(struct crypto_aead *tfm, const u8 *raw_key, \
+ unsigned int keylen) \
+{ \
+ return gcm_setkey(tfm, raw_key, keylen, (flags)); \
+} \
+ \
+static int gcm_encrypt_##suffix(struct aead_request *req) \
+{ \
+ return gcm_crypt(req, (flags) | FLAG_ENC); \
+} \
+ \
+static int gcm_decrypt_##suffix(struct aead_request *req) \
+{ \
+ return gcm_crypt(req, (flags)); \
+} \
+ \
+static int rfc4106_setkey_##suffix(struct crypto_aead *tfm, const u8 *raw_key, \
+ unsigned int keylen) \
+{ \
+ return gcm_setkey(tfm, raw_key, keylen, (flags) | FLAG_RFC4106); \
+} \
+ \
+static int rfc4106_encrypt_##suffix(struct aead_request *req) \
+{ \
+ return gcm_crypt(req, (flags) | FLAG_RFC4106 | FLAG_ENC); \
+} \
+ \
+static int rfc4106_decrypt_##suffix(struct aead_request *req) \
+{ \
+ return gcm_crypt(req, (flags) | FLAG_RFC4106); \
+} \
+ \
+static struct aead_alg aes_gcm_algs_##suffix[] = { { \
+ .setkey = gcm_setkey_##suffix, \
+ .setauthsize = generic_gcmaes_set_authsize, \
+ .encrypt = gcm_encrypt_##suffix, \
+ .decrypt = gcm_decrypt_##suffix, \
+ .ivsize = GCM_AES_IV_SIZE, \
+ .chunksize = AES_BLOCK_SIZE, \
+ .maxauthsize = 16, \
+ .base = { \
+ .cra_name = "__gcm(aes)", \
+ .cra_driver_name = "__" generic_driver_name, \
+ .cra_priority = (priority), \
+ .cra_flags = CRYPTO_ALG_INTERNAL, \
+ .cra_blocksize = 1, \
+ .cra_ctxsize = (ctxsize), \
+ .cra_module = THIS_MODULE, \
+ }, \
+}, { \
+ .setkey = rfc4106_setkey_##suffix, \
+ .setauthsize = common_rfc4106_set_authsize, \
+ .encrypt = rfc4106_encrypt_##suffix, \
+ .decrypt = rfc4106_decrypt_##suffix, \
+ .ivsize = GCM_RFC4106_IV_SIZE, \
+ .chunksize = AES_BLOCK_SIZE, \
+ .maxauthsize = 16, \
+ .base = { \
+ .cra_name = "__rfc4106(gcm(aes))", \
+ .cra_driver_name = "__" rfc_driver_name, \
+ .cra_priority = (priority), \
+ .cra_flags = CRYPTO_ALG_INTERNAL, \
+ .cra_blocksize = 1, \
+ .cra_ctxsize = (ctxsize), \
+ .cra_module = THIS_MODULE, \
+ }, \
+} }; \
+ \
+static struct simd_aead_alg *aes_gcm_simdalgs_##suffix[2] \
+
+/* aes_gcm_algs_aesni */
+DEFINE_GCM_ALGS(aesni, /* no flags */ 0,
+ "generic-gcm-aesni", "rfc4106-gcm-aesni",
+ AES_GCM_KEY_AESNI_SIZE, 400);
+
+/* aes_gcm_algs_aesni_avx */
+DEFINE_GCM_ALGS(aesni_avx, FLAG_AVX,
+ "generic-gcm-aesni-avx", "rfc4106-gcm-aesni-avx",
+ AES_GCM_KEY_AESNI_SIZE, 500);
+
+#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ)
+/* aes_gcm_algs_vaes_avx10_256 */
+DEFINE_GCM_ALGS(vaes_avx10_256, FLAG_AVX10_256,
+ "generic-gcm-vaes-avx10_256", "rfc4106-gcm-vaes-avx10_256",
+ AES_GCM_KEY_AVX10_SIZE, 700);
+
+/* aes_gcm_algs_vaes_avx10_512 */
+DEFINE_GCM_ALGS(vaes_avx10_512, FLAG_AVX10_512,
+ "generic-gcm-vaes-avx10_512", "rfc4106-gcm-vaes-avx10_512",
+ AES_GCM_KEY_AVX10_SIZE, 800);
+#endif /* CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ */
+
+/*
+ * This is a list of CPU models that are known to suffer from downclocking when
+ * zmm registers (512-bit vectors) are used. On these CPUs, the AES mode
+ * implementations with zmm registers won't be used by default. Implementations
+ * with ymm registers (256-bit vectors) will be used by default instead.
+ */
+static const struct x86_cpu_id zmm_exclusion_list[] = {
+ X86_MATCH_VFM(INTEL_SKYLAKE_X, 0),
+ X86_MATCH_VFM(INTEL_ICELAKE_X, 0),
+ X86_MATCH_VFM(INTEL_ICELAKE_D, 0),
+ X86_MATCH_VFM(INTEL_ICELAKE, 0),
+ X86_MATCH_VFM(INTEL_ICELAKE_L, 0),
+ X86_MATCH_VFM(INTEL_ICELAKE_NNPI, 0),
+ X86_MATCH_VFM(INTEL_TIGERLAKE_L, 0),
+ X86_MATCH_VFM(INTEL_TIGERLAKE, 0),
+ /* Allow Rocket Lake and later, and Sapphire Rapids and later. */
+ /* Also allow AMD CPUs (starting with Zen 4, the first with AVX-512). */
+ {},
+};
+
+static int __init register_avx_algs(void)
+{
+ int err;
+
+ if (!boot_cpu_has(X86_FEATURE_AVX))
+ return 0;
+ err = simd_register_skciphers_compat(&aes_xts_alg_aesni_avx, 1,
+ &aes_xts_simdalg_aesni_avx);
+ if (err)
+ return err;
+ err = simd_register_aeads_compat(aes_gcm_algs_aesni_avx,
+ ARRAY_SIZE(aes_gcm_algs_aesni_avx),
+ aes_gcm_simdalgs_aesni_avx);
+ if (err)
+ return err;
+#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ)
+ if (!boot_cpu_has(X86_FEATURE_AVX2) ||
+ !boot_cpu_has(X86_FEATURE_VAES) ||
+ !boot_cpu_has(X86_FEATURE_VPCLMULQDQ) ||
+ !boot_cpu_has(X86_FEATURE_PCLMULQDQ) ||
+ !cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
+ return 0;
+ err = simd_register_skciphers_compat(&aes_xts_alg_vaes_avx2, 1,
+ &aes_xts_simdalg_vaes_avx2);
+ if (err)
+ return err;
+
+ if (!boot_cpu_has(X86_FEATURE_AVX512BW) ||
+ !boot_cpu_has(X86_FEATURE_AVX512VL) ||
+ !boot_cpu_has(X86_FEATURE_BMI2) ||
+ !cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM |
+ XFEATURE_MASK_AVX512, NULL))
+ return 0;
+
+ err = simd_register_skciphers_compat(&aes_xts_alg_vaes_avx10_256, 1,
+ &aes_xts_simdalg_vaes_avx10_256);
+ if (err)
+ return err;
+ err = simd_register_aeads_compat(aes_gcm_algs_vaes_avx10_256,
+ ARRAY_SIZE(aes_gcm_algs_vaes_avx10_256),
+ aes_gcm_simdalgs_vaes_avx10_256);
+ if (err)
+ return err;
+
+ if (x86_match_cpu(zmm_exclusion_list)) {
+ int i;
+
+ aes_xts_alg_vaes_avx10_512.base.cra_priority = 1;
+ for (i = 0; i < ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512); i++)
+ aes_gcm_algs_vaes_avx10_512[i].base.cra_priority = 1;
+ }
+
+ err = simd_register_skciphers_compat(&aes_xts_alg_vaes_avx10_512, 1,
+ &aes_xts_simdalg_vaes_avx10_512);
+ if (err)
+ return err;
+ err = simd_register_aeads_compat(aes_gcm_algs_vaes_avx10_512,
+ ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512),
+ aes_gcm_simdalgs_vaes_avx10_512);
+ if (err)
+ return err;
+#endif /* CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ */
+ return 0;
+}
+
+static void unregister_avx_algs(void)
+{
+ if (aes_xts_simdalg_aesni_avx)
+ simd_unregister_skciphers(&aes_xts_alg_aesni_avx, 1,
+ &aes_xts_simdalg_aesni_avx);
+ if (aes_gcm_simdalgs_aesni_avx[0])
+ simd_unregister_aeads(aes_gcm_algs_aesni_avx,
+ ARRAY_SIZE(aes_gcm_algs_aesni_avx),
+ aes_gcm_simdalgs_aesni_avx);
+#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ)
+ if (aes_xts_simdalg_vaes_avx2)
+ simd_unregister_skciphers(&aes_xts_alg_vaes_avx2, 1,
+ &aes_xts_simdalg_vaes_avx2);
+ if (aes_xts_simdalg_vaes_avx10_256)
+ simd_unregister_skciphers(&aes_xts_alg_vaes_avx10_256, 1,
+ &aes_xts_simdalg_vaes_avx10_256);
+ if (aes_gcm_simdalgs_vaes_avx10_256[0])
+ simd_unregister_aeads(aes_gcm_algs_vaes_avx10_256,
+ ARRAY_SIZE(aes_gcm_algs_vaes_avx10_256),
+ aes_gcm_simdalgs_vaes_avx10_256);
+ if (aes_xts_simdalg_vaes_avx10_512)
+ simd_unregister_skciphers(&aes_xts_alg_vaes_avx10_512, 1,
+ &aes_xts_simdalg_vaes_avx10_512);
+ if (aes_gcm_simdalgs_vaes_avx10_512[0])
+ simd_unregister_aeads(aes_gcm_algs_vaes_avx10_512,
+ ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512),
+ aes_gcm_simdalgs_vaes_avx10_512);
#endif
+}
+#else /* CONFIG_X86_64 */
+static struct aead_alg aes_gcm_algs_aesni[0];
+static struct simd_aead_alg *aes_gcm_simdalgs_aesni[0];
-static struct simd_aead_alg *aesni_simd_aeads[ARRAY_SIZE(aesni_aeads)];
+static int __init register_avx_algs(void)
+{
+ return 0;
+}
+
+static void unregister_avx_algs(void)
+{
+}
+#endif /* !CONFIG_X86_64 */
static const struct x86_cpu_id aesni_cpu_id[] = {
X86_MATCH_FEATURE(X86_FEATURE_AES, NULL),
@@ -1235,17 +1677,6 @@ static int __init aesni_init(void)
if (!x86_match_cpu(aesni_cpu_id))
return -ENODEV;
#ifdef CONFIG_X86_64
- if (boot_cpu_has(X86_FEATURE_AVX2)) {
- pr_info("AVX2 version of gcm_enc/dec engaged.\n");
- static_branch_enable(&gcm_use_avx);
- static_branch_enable(&gcm_use_avx2);
- } else
- if (boot_cpu_has(X86_FEATURE_AVX)) {
- pr_info("AVX version of gcm_enc/dec engaged.\n");
- static_branch_enable(&gcm_use_avx);
- } else {
- pr_info("SSE version of gcm_enc/dec engaged.\n");
- }
if (boot_cpu_has(X86_FEATURE_AVX)) {
/* optimize performance of ctr mode encryption transform */
static_call_update(aesni_ctr_enc_tfm, aesni_ctr_enc_avx_tfm);
@@ -1263,8 +1694,9 @@ static int __init aesni_init(void)
if (err)
goto unregister_cipher;
- err = simd_register_aeads_compat(aesni_aeads, ARRAY_SIZE(aesni_aeads),
- aesni_simd_aeads);
+ err = simd_register_aeads_compat(aes_gcm_algs_aesni,
+ ARRAY_SIZE(aes_gcm_algs_aesni),
+ aes_gcm_simdalgs_aesni);
if (err)
goto unregister_skciphers;
@@ -1276,14 +1708,22 @@ static int __init aesni_init(void)
goto unregister_aeads;
#endif /* CONFIG_X86_64 */
+ err = register_avx_algs();
+ if (err)
+ goto unregister_avx;
+
return 0;
+unregister_avx:
+ unregister_avx_algs();
#ifdef CONFIG_X86_64
+ if (aesni_simd_xctr)
+ simd_unregister_skciphers(&aesni_xctr, 1, &aesni_simd_xctr);
unregister_aeads:
- simd_unregister_aeads(aesni_aeads, ARRAY_SIZE(aesni_aeads),
- aesni_simd_aeads);
#endif /* CONFIG_X86_64 */
-
+ simd_unregister_aeads(aes_gcm_algs_aesni,
+ ARRAY_SIZE(aes_gcm_algs_aesni),
+ aes_gcm_simdalgs_aesni);
unregister_skciphers:
simd_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers),
aesni_simd_skciphers);
@@ -1294,8 +1734,9 @@ unregister_cipher:
static void __exit aesni_exit(void)
{
- simd_unregister_aeads(aesni_aeads, ARRAY_SIZE(aesni_aeads),
- aesni_simd_aeads);
+ simd_unregister_aeads(aes_gcm_algs_aesni,
+ ARRAY_SIZE(aes_gcm_algs_aesni),
+ aes_gcm_simdalgs_aesni);
simd_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers),
aesni_simd_skciphers);
crypto_unregister_alg(&aesni_cipher_alg);
@@ -1303,11 +1744,12 @@ static void __exit aesni_exit(void)
if (boot_cpu_has(X86_FEATURE_AVX))
simd_unregister_skciphers(&aesni_xctr, 1, &aesni_simd_xctr);
#endif /* CONFIG_X86_64 */
+ unregister_avx_algs();
}
late_initcall(aesni_init);
module_exit(aesni_exit);
-MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm, Intel AES-NI instructions optimized");
+MODULE_DESCRIPTION("AES cipher and modes, optimized with AES-NI or VAES instructions");
MODULE_LICENSE("GPL");
MODULE_ALIAS_CRYPTO("aes");
diff --git a/arch/x86/crypto/camellia_glue.c b/arch/x86/crypto/camellia_glue.c
index d45e9c0c42ac..f110708c8038 100644
--- a/arch/x86/crypto/camellia_glue.c
+++ b/arch/x86/crypto/camellia_glue.c
@@ -8,7 +8,7 @@
* Copyright (C) 2006 NTT (Nippon Telegraph and Telephone Corporation)
*/
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#include <linux/crypto.h>
#include <linux/init.h>
#include <linux/module.h>
diff --git a/arch/x86/crypto/crc32-pclmul_glue.c b/arch/x86/crypto/crc32-pclmul_glue.c
index 98cf3b4e4c9f..9f5e342b9845 100644
--- a/arch/x86/crypto/crc32-pclmul_glue.c
+++ b/arch/x86/crypto/crc32-pclmul_glue.c
@@ -195,6 +195,7 @@ module_init(crc32_pclmul_mod_init);
module_exit(crc32_pclmul_mod_fini);
MODULE_AUTHOR("Alexander Boyko <alexander_boyko@xyratex.com>");
+MODULE_DESCRIPTION("CRC32 algorithm (IEEE 802.3) accelerated with PCLMULQDQ");
MODULE_LICENSE("GPL");
MODULE_ALIAS_CRYPTO("crc32");
diff --git a/arch/x86/crypto/curve25519-x86_64.c b/arch/x86/crypto/curve25519-x86_64.c
index d55fa9e9b9e6..dcfc0de333de 100644
--- a/arch/x86/crypto/curve25519-x86_64.c
+++ b/arch/x86/crypto/curve25519-x86_64.c
@@ -1720,5 +1720,6 @@ module_exit(curve25519_mod_exit);
MODULE_ALIAS_CRYPTO("curve25519");
MODULE_ALIAS_CRYPTO("curve25519-x86");
+MODULE_DESCRIPTION("Curve25519 algorithm, ADX optimized");
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");
diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c
index 700ecaee9a08..41bc02e48916 100644
--- a/arch/x86/crypto/ghash-clmulni-intel_glue.c
+++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c
@@ -19,7 +19,7 @@
#include <crypto/internal/simd.h>
#include <asm/cpu_device_id.h>
#include <asm/simd.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#define GHASH_BLOCK_SIZE 16
#define GHASH_DIGEST_SIZE 16
diff --git a/arch/x86/crypto/nh-avx2-x86_64.S b/arch/x86/crypto/nh-avx2-x86_64.S
index ef73a3ab8726..791386d9a83a 100644
--- a/arch/x86/crypto/nh-avx2-x86_64.S
+++ b/arch/x86/crypto/nh-avx2-x86_64.S
@@ -154,5 +154,6 @@ SYM_TYPED_FUNC_START(nh_avx2)
vpaddq T1, T0, T0
vpaddq T4, T0, T0
vmovdqu T0, (HASH)
+ vzeroupper
RET
SYM_FUNC_END(nh_avx2)
diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c
index 1dfb8af48a3c..08ff4b489f7e 100644
--- a/arch/x86/crypto/poly1305_glue.c
+++ b/arch/x86/crypto/poly1305_glue.c
@@ -12,7 +12,7 @@
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/sizes.h>
-#include <asm/intel-family.h>
+#include <asm/cpu_device_id.h>
#include <asm/simd.h>
asmlinkage void poly1305_init_x86_64(void *ctx,
@@ -269,7 +269,7 @@ static int __init poly1305_simd_mod_init(void)
boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX512F) &&
cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | XFEATURE_MASK_AVX512, NULL) &&
/* Skylake downclocks unacceptably much when using zmm, but later generations are fast. */
- boot_cpu_data.x86_model != INTEL_FAM6_SKYLAKE_X)
+ boot_cpu_data.x86_vfm != INTEL_SKYLAKE_X)
static_branch_enable(&poly1305_use_avx512);
return IS_REACHABLE(CONFIG_CRYPTO_HASH) ? crypto_register_shash(&alg) : 0;
}
diff --git a/arch/x86/crypto/sha256-avx2-asm.S b/arch/x86/crypto/sha256-avx2-asm.S
index 9918212faf91..0bbec1c75cd0 100644
--- a/arch/x86/crypto/sha256-avx2-asm.S
+++ b/arch/x86/crypto/sha256-avx2-asm.S
@@ -592,22 +592,22 @@ SYM_TYPED_FUNC_START(sha256_transform_rorx)
leaq K256+0*32(%rip), INP ## reuse INP as scratch reg
vpaddd (INP, SRND), X0, XFER
vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
- FOUR_ROUNDS_AND_SCHED _XFER + 0*32
+ FOUR_ROUNDS_AND_SCHED (_XFER + 0*32)
leaq K256+1*32(%rip), INP
vpaddd (INP, SRND), X0, XFER
vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
- FOUR_ROUNDS_AND_SCHED _XFER + 1*32
+ FOUR_ROUNDS_AND_SCHED (_XFER + 1*32)
leaq K256+2*32(%rip), INP
vpaddd (INP, SRND), X0, XFER
vmovdqa XFER, 2*32+_XFER(%rsp, SRND)
- FOUR_ROUNDS_AND_SCHED _XFER + 2*32
+ FOUR_ROUNDS_AND_SCHED (_XFER + 2*32)
leaq K256+3*32(%rip), INP
vpaddd (INP, SRND), X0, XFER
vmovdqa XFER, 3*32+_XFER(%rsp, SRND)
- FOUR_ROUNDS_AND_SCHED _XFER + 3*32
+ FOUR_ROUNDS_AND_SCHED (_XFER + 3*32)
add $4*32, SRND
cmp $3*4*32, SRND
@@ -618,12 +618,12 @@ SYM_TYPED_FUNC_START(sha256_transform_rorx)
leaq K256+0*32(%rip), INP
vpaddd (INP, SRND), X0, XFER
vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
- DO_4ROUNDS _XFER + 0*32
+ DO_4ROUNDS (_XFER + 0*32)
leaq K256+1*32(%rip), INP
vpaddd (INP, SRND), X1, XFER
vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
- DO_4ROUNDS _XFER + 1*32
+ DO_4ROUNDS (_XFER + 1*32)
add $2*32, SRND
vmovdqa X2, X0
@@ -651,8 +651,8 @@ SYM_TYPED_FUNC_START(sha256_transform_rorx)
xor SRND, SRND
.align 16
.Lloop3:
- DO_4ROUNDS _XFER + 0*32 + 16
- DO_4ROUNDS _XFER + 1*32 + 16
+ DO_4ROUNDS (_XFER + 0*32 + 16)
+ DO_4ROUNDS (_XFER + 1*32 + 16)
add $2*32, SRND
cmp $4*4*32, SRND
jb .Lloop3
@@ -716,6 +716,7 @@ SYM_TYPED_FUNC_START(sha256_transform_rorx)
popq %r13
popq %r12
popq %rbx
+ vzeroupper
RET
SYM_FUNC_END(sha256_transform_rorx)
diff --git a/arch/x86/crypto/sha256_ni_asm.S b/arch/x86/crypto/sha256_ni_asm.S
index 537b6dcd7ed8..d515a55a3bc1 100644
--- a/arch/x86/crypto/sha256_ni_asm.S
+++ b/arch/x86/crypto/sha256_ni_asm.S
@@ -62,20 +62,41 @@
#define SHA256CONSTANTS %rax
-#define MSG %xmm0
+#define MSG %xmm0 /* sha256rnds2 implicit operand */
#define STATE0 %xmm1
#define STATE1 %xmm2
-#define MSGTMP0 %xmm3
-#define MSGTMP1 %xmm4
-#define MSGTMP2 %xmm5
-#define MSGTMP3 %xmm6
-#define MSGTMP4 %xmm7
+#define MSG0 %xmm3
+#define MSG1 %xmm4
+#define MSG2 %xmm5
+#define MSG3 %xmm6
+#define TMP %xmm7
#define SHUF_MASK %xmm8
#define ABEF_SAVE %xmm9
#define CDGH_SAVE %xmm10
+.macro do_4rounds i, m0, m1, m2, m3
+.if \i < 16
+ movdqu \i*4(DATA_PTR), \m0
+ pshufb SHUF_MASK, \m0
+.endif
+ movdqa (\i-32)*4(SHA256CONSTANTS), MSG
+ paddd \m0, MSG
+ sha256rnds2 STATE0, STATE1
+.if \i >= 12 && \i < 60
+ movdqa \m0, TMP
+ palignr $4, \m3, TMP
+ paddd TMP, \m1
+ sha256msg2 \m0, \m1
+.endif
+ punpckhqdq MSG, MSG
+ sha256rnds2 STATE1, STATE0
+.if \i >= 4 && \i < 52
+ sha256msg1 \m0, \m3
+.endif
+.endm
+
/*
* Intel SHA Extensions optimized implementation of a SHA-256 update function
*
@@ -86,9 +107,6 @@
* store partial blocks. All message padding and hash value initialization must
* be done outside the update function.
*
- * The indented lines in the loop are instructions related to rounds processing.
- * The non-indented lines are instructions related to the message schedule.
- *
* void sha256_ni_transform(uint32_t *digest, const void *data,
uint32_t numBlocks);
* digest : pointer to digest
@@ -108,202 +126,29 @@ SYM_TYPED_FUNC_START(sha256_ni_transform)
* Need to reorder these appropriately
* DCBA, HGFE -> ABEF, CDGH
*/
- movdqu 0*16(DIGEST_PTR), STATE0
- movdqu 1*16(DIGEST_PTR), STATE1
+ movdqu 0*16(DIGEST_PTR), STATE0 /* DCBA */
+ movdqu 1*16(DIGEST_PTR), STATE1 /* HGFE */
- pshufd $0xB1, STATE0, STATE0 /* CDAB */
- pshufd $0x1B, STATE1, STATE1 /* EFGH */
- movdqa STATE0, MSGTMP4
- palignr $8, STATE1, STATE0 /* ABEF */
- pblendw $0xF0, MSGTMP4, STATE1 /* CDGH */
+ movdqa STATE0, TMP
+ punpcklqdq STATE1, STATE0 /* FEBA */
+ punpckhqdq TMP, STATE1 /* DCHG */
+ pshufd $0x1B, STATE0, STATE0 /* ABEF */
+ pshufd $0xB1, STATE1, STATE1 /* CDGH */
movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
- lea K256(%rip), SHA256CONSTANTS
+ lea K256+32*4(%rip), SHA256CONSTANTS
.Lloop0:
/* Save hash values for addition after rounds */
movdqa STATE0, ABEF_SAVE
movdqa STATE1, CDGH_SAVE
- /* Rounds 0-3 */
- movdqu 0*16(DATA_PTR), MSG
- pshufb SHUF_MASK, MSG
- movdqa MSG, MSGTMP0
- paddd 0*16(SHA256CONSTANTS), MSG
- sha256rnds2 STATE0, STATE1
- pshufd $0x0E, MSG, MSG
- sha256rnds2 STATE1, STATE0
-
- /* Rounds 4-7 */
- movdqu 1*16(DATA_PTR), MSG
- pshufb SHUF_MASK, MSG
- movdqa MSG, MSGTMP1
- paddd 1*16(SHA256CONSTANTS), MSG
- sha256rnds2 STATE0, STATE1
- pshufd $0x0E, MSG, MSG
- sha256rnds2 STATE1, STATE0
- sha256msg1 MSGTMP1, MSGTMP0
-
- /* Rounds 8-11 */
- movdqu 2*16(DATA_PTR), MSG
- pshufb SHUF_MASK, MSG
- movdqa MSG, MSGTMP2
- paddd 2*16(SHA256CONSTANTS), MSG
- sha256rnds2 STATE0, STATE1
- pshufd $0x0E, MSG, MSG
- sha256rnds2 STATE1, STATE0
- sha256msg1 MSGTMP2, MSGTMP1
-
- /* Rounds 12-15 */
- movdqu 3*16(DATA_PTR), MSG
- pshufb SHUF_MASK, MSG
- movdqa MSG, MSGTMP3
- paddd 3*16(SHA256CONSTANTS), MSG
- sha256rnds2 STATE0, STATE1
- movdqa MSGTMP3, MSGTMP4
- palignr $4, MSGTMP2, MSGTMP4
- paddd MSGTMP4, MSGTMP0
- sha256msg2 MSGTMP3, MSGTMP0
- pshufd $0x0E, MSG, MSG
- sha256rnds2 STATE1, STATE0
- sha256msg1 MSGTMP3, MSGTMP2
-
- /* Rounds 16-19 */
- movdqa MSGTMP0, MSG
- paddd 4*16(SHA256CONSTANTS), MSG
- sha256rnds2 STATE0, STATE1
- movdqa MSGTMP0, MSGTMP4
- palignr $4, MSGTMP3, MSGTMP4
- paddd MSGTMP4, MSGTMP1
- sha256msg2 MSGTMP0, MSGTMP1
- pshufd $0x0E, MSG, MSG
- sha256rnds2 STATE1, STATE0
- sha256msg1 MSGTMP0, MSGTMP3
-
- /* Rounds 20-23 */
- movdqa MSGTMP1, MSG
- paddd 5*16(SHA256CONSTANTS), MSG
- sha256rnds2 STATE0, STATE1
- movdqa MSGTMP1, MSGTMP4
- palignr $4, MSGTMP0, MSGTMP4
- paddd MSGTMP4, MSGTMP2
- sha256msg2 MSGTMP1, MSGTMP2
- pshufd $0x0E, MSG, MSG
- sha256rnds2 STATE1, STATE0
- sha256msg1 MSGTMP1, MSGTMP0
-
- /* Rounds 24-27 */
- movdqa MSGTMP2, MSG
- paddd 6*16(SHA256CONSTANTS), MSG
- sha256rnds2 STATE0, STATE1
- movdqa MSGTMP2, MSGTMP4
- palignr $4, MSGTMP1, MSGTMP4
- paddd MSGTMP4, MSGTMP3
- sha256msg2 MSGTMP2, MSGTMP3
- pshufd $0x0E, MSG, MSG
- sha256rnds2 STATE1, STATE0
- sha256msg1 MSGTMP2, MSGTMP1
-
- /* Rounds 28-31 */
- movdqa MSGTMP3, MSG
- paddd 7*16(SHA256CONSTANTS), MSG
- sha256rnds2 STATE0, STATE1
- movdqa MSGTMP3, MSGTMP4
- palignr $4, MSGTMP2, MSGTMP4
- paddd MSGTMP4, MSGTMP0
- sha256msg2 MSGTMP3, MSGTMP0
- pshufd $0x0E, MSG, MSG
- sha256rnds2 STATE1, STATE0
- sha256msg1 MSGTMP3, MSGTMP2
-
- /* Rounds 32-35 */
- movdqa MSGTMP0, MSG
- paddd 8*16(SHA256CONSTANTS), MSG
- sha256rnds2 STATE0, STATE1
- movdqa MSGTMP0, MSGTMP4
- palignr $4, MSGTMP3, MSGTMP4
- paddd MSGTMP4, MSGTMP1
- sha256msg2 MSGTMP0, MSGTMP1
- pshufd $0x0E, MSG, MSG
- sha256rnds2 STATE1, STATE0
- sha256msg1 MSGTMP0, MSGTMP3
-
- /* Rounds 36-39 */
- movdqa MSGTMP1, MSG
- paddd 9*16(SHA256CONSTANTS), MSG
- sha256rnds2 STATE0, STATE1
- movdqa MSGTMP1, MSGTMP4
- palignr $4, MSGTMP0, MSGTMP4
- paddd MSGTMP4, MSGTMP2
- sha256msg2 MSGTMP1, MSGTMP2
- pshufd $0x0E, MSG, MSG
- sha256rnds2 STATE1, STATE0
- sha256msg1 MSGTMP1, MSGTMP0
-
- /* Rounds 40-43 */
- movdqa MSGTMP2, MSG
- paddd 10*16(SHA256CONSTANTS), MSG
- sha256rnds2 STATE0, STATE1
- movdqa MSGTMP2, MSGTMP4
- palignr $4, MSGTMP1, MSGTMP4
- paddd MSGTMP4, MSGTMP3
- sha256msg2 MSGTMP2, MSGTMP3
- pshufd $0x0E, MSG, MSG
- sha256rnds2 STATE1, STATE0
- sha256msg1 MSGTMP2, MSGTMP1
-
- /* Rounds 44-47 */
- movdqa MSGTMP3, MSG
- paddd 11*16(SHA256CONSTANTS), MSG
- sha256rnds2 STATE0, STATE1
- movdqa MSGTMP3, MSGTMP4
- palignr $4, MSGTMP2, MSGTMP4
- paddd MSGTMP4, MSGTMP0
- sha256msg2 MSGTMP3, MSGTMP0
- pshufd $0x0E, MSG, MSG
- sha256rnds2 STATE1, STATE0
- sha256msg1 MSGTMP3, MSGTMP2
-
- /* Rounds 48-51 */
- movdqa MSGTMP0, MSG
- paddd 12*16(SHA256CONSTANTS), MSG
- sha256rnds2 STATE0, STATE1
- movdqa MSGTMP0, MSGTMP4
- palignr $4, MSGTMP3, MSGTMP4
- paddd MSGTMP4, MSGTMP1
- sha256msg2 MSGTMP0, MSGTMP1
- pshufd $0x0E, MSG, MSG
- sha256rnds2 STATE1, STATE0
- sha256msg1 MSGTMP0, MSGTMP3
-
- /* Rounds 52-55 */
- movdqa MSGTMP1, MSG
- paddd 13*16(SHA256CONSTANTS), MSG
- sha256rnds2 STATE0, STATE1
- movdqa MSGTMP1, MSGTMP4
- palignr $4, MSGTMP0, MSGTMP4
- paddd MSGTMP4, MSGTMP2
- sha256msg2 MSGTMP1, MSGTMP2
- pshufd $0x0E, MSG, MSG
- sha256rnds2 STATE1, STATE0
-
- /* Rounds 56-59 */
- movdqa MSGTMP2, MSG
- paddd 14*16(SHA256CONSTANTS), MSG
- sha256rnds2 STATE0, STATE1
- movdqa MSGTMP2, MSGTMP4
- palignr $4, MSGTMP1, MSGTMP4
- paddd MSGTMP4, MSGTMP3
- sha256msg2 MSGTMP2, MSGTMP3
- pshufd $0x0E, MSG, MSG
- sha256rnds2 STATE1, STATE0
-
- /* Rounds 60-63 */
- movdqa MSGTMP3, MSG
- paddd 15*16(SHA256CONSTANTS), MSG
- sha256rnds2 STATE0, STATE1
- pshufd $0x0E, MSG, MSG
- sha256rnds2 STATE1, STATE0
+.irp i, 0, 16, 32, 48
+ do_4rounds (\i + 0), MSG0, MSG1, MSG2, MSG3
+ do_4rounds (\i + 4), MSG1, MSG2, MSG3, MSG0
+ do_4rounds (\i + 8), MSG2, MSG3, MSG0, MSG1
+ do_4rounds (\i + 12), MSG3, MSG0, MSG1, MSG2
+.endr
/* Add current hash values with previously saved */
paddd ABEF_SAVE, STATE0
@@ -315,14 +160,14 @@ SYM_TYPED_FUNC_START(sha256_ni_transform)
jne .Lloop0
/* Write hash values back in the correct order */
- pshufd $0x1B, STATE0, STATE0 /* FEBA */
- pshufd $0xB1, STATE1, STATE1 /* DCHG */
- movdqa STATE0, MSGTMP4
- pblendw $0xF0, STATE1, STATE0 /* DCBA */
- palignr $8, MSGTMP4, STATE1 /* HGFE */
-
- movdqu STATE0, 0*16(DIGEST_PTR)
- movdqu STATE1, 1*16(DIGEST_PTR)
+ movdqa STATE0, TMP
+ punpcklqdq STATE1, STATE0 /* GHEF */
+ punpckhqdq TMP, STATE1 /* ABCD */
+ pshufd $0xB1, STATE0, STATE0 /* HGFE */
+ pshufd $0x1B, STATE1, STATE1 /* DCBA */
+
+ movdqu STATE1, 0*16(DIGEST_PTR)
+ movdqu STATE0, 1*16(DIGEST_PTR)
.Ldone_hash:
diff --git a/arch/x86/crypto/sha512-avx2-asm.S b/arch/x86/crypto/sha512-avx2-asm.S
index f08496cd6870..24973f42c43f 100644
--- a/arch/x86/crypto/sha512-avx2-asm.S
+++ b/arch/x86/crypto/sha512-avx2-asm.S
@@ -680,6 +680,7 @@ SYM_TYPED_FUNC_START(sha512_transform_rorx)
pop %r12
pop %rbx
+ vzeroupper
RET
SYM_FUNC_END(sha512_transform_rorx)
diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c
index 90454cf18e0d..1a1ecfa7f72a 100644
--- a/arch/x86/crypto/twofish_glue_3way.c
+++ b/arch/x86/crypto/twofish_glue_3way.c
@@ -5,6 +5,7 @@
* Copyright (c) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
*/
+#include <asm/cpu_device_id.h>
#include <crypto/algapi.h>
#include <crypto/twofish.h>
#include <linux/crypto.h>
@@ -107,10 +108,10 @@ static bool is_blacklisted_cpu(void)
if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
return false;
- if (boot_cpu_data.x86 == 0x06 &&
- (boot_cpu_data.x86_model == 0x1c ||
- boot_cpu_data.x86_model == 0x26 ||
- boot_cpu_data.x86_model == 0x36)) {
+ switch (boot_cpu_data.x86_vfm) {
+ case INTEL_ATOM_BONNELL:
+ case INTEL_ATOM_BONNELL_MID:
+ case INTEL_ATOM_SALTWELL:
/*
* On Atom, twofish-3way is slower than original assembler
* implementation. Twofish-3way trades off some performance in
diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile
index c93e7f5c2a06..ce1cc1622385 100644
--- a/arch/x86/entry/Makefile
+++ b/arch/x86/entry/Makefile
@@ -17,7 +17,7 @@ obj-y += common.o
obj-y += vdso/
obj-y += vsyscall/
-obj-$(CONFIG_PREEMPTION) += thunk_$(BITS).o
+obj-$(CONFIG_PREEMPTION) += thunk.o
CFLAGS_entry_fred.o += -fno-stack-protector
CFLAGS_REMOVE_entry_fred.o += -pg $(CC_FLAGS_FTRACE)
obj-$(CONFIG_X86_FRED) += entry_64_fred.o entry_fred.o
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 51cc9c7cb9bd..94941c5a10ac 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -150,7 +150,7 @@ early_param("ia32_emulation", ia32_emulation_override_cmdline);
#endif
/*
- * Invoke a 32-bit syscall. Called with IRQs on in CONTEXT_KERNEL.
+ * Invoke a 32-bit syscall. Called with IRQs on in CT_STATE_KERNEL.
*/
static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs, int nr)
{
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index c779046cc3fe..ed0a5f2dc129 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -7,7 +7,6 @@
#include <asm/asm-offsets.h>
#include <asm/current.h>
#include <asm/errno.h>
-#include <asm/ia32_unistd.h>
#include <asm/thread_info.h>
#include <asm/segment.h>
#include <asm/irqflags.h>
@@ -90,10 +89,6 @@ SYM_INNER_LABEL(entry_SYSENTER_compat_after_hwframe, SYM_L_GLOBAL)
cld
- IBRS_ENTER
- UNTRAIN_RET
- CLEAR_BRANCH_HISTORY
-
/*
* SYSENTER doesn't filter flags, so we need to clear NT and AC
* ourselves. To save a few cycles, we can check whether
@@ -117,6 +112,16 @@ SYM_INNER_LABEL(entry_SYSENTER_compat_after_hwframe, SYM_L_GLOBAL)
jnz .Lsysenter_fix_flags
.Lsysenter_flags_fixed:
+ /*
+ * CPU bugs mitigations mechanisms can call other functions. They
+ * should be invoked after making sure TF is cleared because
+ * single-step is ignored only for instructions inside the
+ * entry_SYSENTER_compat function.
+ */
+ IBRS_ENTER
+ UNTRAIN_RET
+ CLEAR_BRANCH_HISTORY
+
movq %rsp, %rdi
call do_SYSENTER_32
jmp sysret32_from_system_call
diff --git a/arch/x86/entry/entry_fred.c b/arch/x86/entry/entry_fred.c
index 89c1476fcdd9..f004a4dc74c2 100644
--- a/arch/x86/entry/entry_fred.c
+++ b/arch/x86/entry/entry_fred.c
@@ -117,6 +117,8 @@ static idtentry_t sysvec_table[NR_SYSTEM_VECTORS] __ro_after_init = {
SYSVEC(POSTED_INTR_VECTOR, kvm_posted_intr_ipi),
SYSVEC(POSTED_INTR_WAKEUP_VECTOR, kvm_posted_intr_wakeup_ipi),
SYSVEC(POSTED_INTR_NESTED_VECTOR, kvm_posted_intr_nested_ipi),
+
+ SYSVEC(POSTED_MSI_NOTIFICATION_VECTOR, posted_msi_notification),
};
static bool fred_setup_done __initdata;
diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c
index c2235bae17ef..8cc9950d7104 100644
--- a/arch/x86/entry/syscall_32.c
+++ b/arch/x86/entry/syscall_32.c
@@ -14,9 +14,12 @@
#endif
#define __SYSCALL(nr, sym) extern long __ia32_##sym(const struct pt_regs *);
-
+#define __SYSCALL_NORETURN(nr, sym) extern long __noreturn __ia32_##sym(const struct pt_regs *);
#include <asm/syscalls_32.h>
-#undef __SYSCALL
+#undef __SYSCALL
+
+#undef __SYSCALL_NORETURN
+#define __SYSCALL_NORETURN __SYSCALL
/*
* The sys_call_table[] is no longer used for system calls, but
@@ -28,11 +31,10 @@
const sys_call_ptr_t sys_call_table[] = {
#include <asm/syscalls_32.h>
};
-#undef __SYSCALL
+#undef __SYSCALL
#endif
#define __SYSCALL(nr, sym) case nr: return __ia32_##sym(regs);
-
long ia32_sys_call(const struct pt_regs *regs, unsigned int nr)
{
switch (nr) {
diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c
index 33b3f09e6f15..ba8354424860 100644
--- a/arch/x86/entry/syscall_64.c
+++ b/arch/x86/entry/syscall_64.c
@@ -8,8 +8,12 @@
#include <asm/syscall.h>
#define __SYSCALL(nr, sym) extern long __x64_##sym(const struct pt_regs *);
+#define __SYSCALL_NORETURN(nr, sym) extern long __noreturn __x64_##sym(const struct pt_regs *);
#include <asm/syscalls_64.h>
-#undef __SYSCALL
+#undef __SYSCALL
+
+#undef __SYSCALL_NORETURN
+#define __SYSCALL_NORETURN __SYSCALL
/*
* The sys_call_table[] is no longer used for system calls, but
@@ -20,10 +24,9 @@
const sys_call_ptr_t sys_call_table[] = {
#include <asm/syscalls_64.h>
};
-#undef __SYSCALL
+#undef __SYSCALL
#define __SYSCALL(nr, sym) case nr: return __x64_##sym(regs);
-
long x64_sys_call(const struct pt_regs *regs, unsigned int nr)
{
switch (nr) {
diff --git a/arch/x86/entry/syscall_x32.c b/arch/x86/entry/syscall_x32.c
index 03de4a932131..fb77908f44f3 100644
--- a/arch/x86/entry/syscall_x32.c
+++ b/arch/x86/entry/syscall_x32.c
@@ -8,11 +8,14 @@
#include <asm/syscall.h>
#define __SYSCALL(nr, sym) extern long __x64_##sym(const struct pt_regs *);
+#define __SYSCALL_NORETURN(nr, sym) extern long __noreturn __x64_##sym(const struct pt_regs *);
#include <asm/syscalls_x32.h>
-#undef __SYSCALL
+#undef __SYSCALL
-#define __SYSCALL(nr, sym) case nr: return __x64_##sym(regs);
+#undef __SYSCALL_NORETURN
+#define __SYSCALL_NORETURN __SYSCALL
+#define __SYSCALL(nr, sym) case nr: return __x64_##sym(regs);
long x32_sys_call(const struct pt_regs *regs, unsigned int nr)
{
switch (nr) {
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 5f8591ce7f25..534c74b14fab 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -1,8 +1,9 @@
+# SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
#
# 32-bit system call numbers and entry vectors
#
# The format is:
-# <number> <abi> <name> <entry point> <compat entry point>
+# <number> <abi> <name> <entry point> [<compat entry point> [noreturn]]
#
# The __ia32_sys and __ia32_compat_sys stubs are created on-the-fly for
# sys_*() system calls and compat_sys_*() compat system calls if
@@ -12,7 +13,7 @@
# The abi is always "i386" for this file.
#
0 i386 restart_syscall sys_restart_syscall
-1 i386 exit sys_exit
+1 i386 exit sys_exit - noreturn
2 i386 fork sys_fork
3 i386 read sys_read
4 i386 write sys_write
@@ -263,7 +264,7 @@
249 i386 io_cancel sys_io_cancel
250 i386 fadvise64 sys_ia32_fadvise64
# 251 is available for reuse (was briefly sys_set_zone_reclaim)
-252 i386 exit_group sys_exit_group
+252 i386 exit_group sys_exit_group - noreturn
253 i386 lookup_dcookie
254 i386 epoll_create sys_epoll_create
255 i386 epoll_ctl sys_epoll_ctl
@@ -420,7 +421,7 @@
412 i386 utimensat_time64 sys_utimensat
413 i386 pselect6_time64 sys_pselect6 compat_sys_pselect6_time64
414 i386 ppoll_time64 sys_ppoll compat_sys_ppoll_time64
-416 i386 io_pgetevents_time64 sys_io_pgetevents
+416 i386 io_pgetevents_time64 sys_io_pgetevents compat_sys_io_pgetevents_time64
417 i386 recvmmsg_time64 sys_recvmmsg compat_sys_recvmmsg_time64
418 i386 mq_timedsend_time64 sys_mq_timedsend
419 i386 mq_timedreceive_time64 sys_mq_timedreceive
@@ -466,3 +467,4 @@
459 i386 lsm_get_self_attr sys_lsm_get_self_attr
460 i386 lsm_set_self_attr sys_lsm_set_self_attr
461 i386 lsm_list_modules sys_lsm_list_modules
+462 i386 mseal sys_mseal
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 7e8d46f4147f..7093ee21c0d1 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -1,8 +1,9 @@
+# SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
#
# 64-bit system call numbers and entry vectors
#
# The format is:
-# <number> <abi> <name> <entry point>
+# <number> <abi> <name> <entry point> [<compat entry point> [noreturn]]
#
# The __x64_sys_*() stubs are created on-the-fly for sys_*() system calls
#
@@ -68,7 +69,7 @@
57 common fork sys_fork
58 common vfork sys_vfork
59 64 execve sys_execve
-60 common exit sys_exit
+60 common exit sys_exit - noreturn
61 common wait4 sys_wait4
62 common kill sys_kill
63 common uname sys_newuname
@@ -239,7 +240,7 @@
228 common clock_gettime sys_clock_gettime
229 common clock_getres sys_clock_getres
230 common clock_nanosleep sys_clock_nanosleep
-231 common exit_group sys_exit_group
+231 common exit_group sys_exit_group - noreturn
232 common epoll_wait sys_epoll_wait
233 common epoll_ctl sys_epoll_ctl
234 common tgkill sys_tgkill
@@ -343,6 +344,7 @@
332 common statx sys_statx
333 common io_pgetevents sys_io_pgetevents
334 common rseq sys_rseq
+335 common uretprobe sys_uretprobe
# don't use numbers 387 through 423, add new calls after the last
# 'common' entry
424 common pidfd_send_signal sys_pidfd_send_signal
@@ -374,7 +376,7 @@
450 common set_mempolicy_home_node sys_set_mempolicy_home_node
451 common cachestat sys_cachestat
452 common fchmodat2 sys_fchmodat2
-453 64 map_shadow_stack sys_map_shadow_stack
+453 common map_shadow_stack sys_map_shadow_stack
454 common futex_wake sys_futex_wake
455 common futex_wait sys_futex_wait
456 common futex_requeue sys_futex_requeue
@@ -383,6 +385,7 @@
459 common lsm_get_self_attr sys_lsm_get_self_attr
460 common lsm_set_self_attr sys_lsm_set_self_attr
461 common lsm_list_modules sys_lsm_list_modules
+462 common mseal sys_mseal
#
# Due to a historical design error, certain syscalls are numbered differently
diff --git a/arch/x86/entry/thunk_64.S b/arch/x86/entry/thunk.S
index 119ebdc3d362..119ebdc3d362 100644
--- a/arch/x86/entry/thunk_64.S
+++ b/arch/x86/entry/thunk.S
diff --git a/arch/x86/entry/thunk_32.S b/arch/x86/entry/thunk_32.S
deleted file mode 100644
index da37f42f4549..000000000000
--- a/arch/x86/entry/thunk_32.S
+++ /dev/null
@@ -1,18 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Trampoline to trace irqs off. (otherwise CALLER_ADDR1 might crash)
- * Copyright 2008 by Steven Rostedt, Red Hat, Inc
- * (inspired by Andi Kleen's thunk_64.S)
- */
-
-#include <linux/export.h>
-#include <linux/linkage.h>
-#include <asm/asm.h>
-
-#include "calling.h"
-
-THUNK preempt_schedule_thunk, preempt_schedule
-THUNK preempt_schedule_notrace_thunk, preempt_schedule_notrace
-EXPORT_SYMBOL(preempt_schedule_thunk)
-EXPORT_SYMBOL(preempt_schedule_notrace_thunk)
-
diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
index 3d64bcc403cf..c9216ac4fb1e 100644
--- a/arch/x86/entry/vdso/Makefile
+++ b/arch/x86/entry/vdso/Makefile
@@ -6,45 +6,20 @@
# Include the generic Makefile to check the built vDSO:
include $(srctree)/lib/vdso/Makefile
-# Sanitizer runtimes are unavailable and cannot be linked here.
-KASAN_SANITIZE := n
-KMSAN_SANITIZE_vclock_gettime.o := n
-KMSAN_SANITIZE_vdso32/vclock_gettime.o := n
-KMSAN_SANITIZE_vgetcpu.o := n
-KMSAN_SANITIZE_vdso32/vgetcpu.o := n
-
-UBSAN_SANITIZE := n
-KCSAN_SANITIZE := n
-OBJECT_FILES_NON_STANDARD := y
-
-# Prevents link failures: __sanitizer_cov_trace_pc() is not linked in.
-KCOV_INSTRUMENT := n
-
# Files to link into the vDSO:
-vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o
+vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o vgetrandom.o vgetrandom-chacha.o
vobjs32-y := vdso32/note.o vdso32/system_call.o vdso32/sigreturn.o
vobjs32-y += vdso32/vclock_gettime.o vdso32/vgetcpu.o
vobjs-$(CONFIG_X86_SGX) += vsgx.o
# Files to link into the kernel:
obj-y += vma.o extable.o
-KASAN_SANITIZE_vma.o := y
-UBSAN_SANITIZE_vma.o := y
-KCSAN_SANITIZE_vma.o := y
-
-OBJECT_FILES_NON_STANDARD_vma.o := n
-OBJECT_FILES_NON_STANDARD_extable.o := n
# vDSO images to build:
obj-$(CONFIG_X86_64) += vdso-image-64.o
obj-$(CONFIG_X86_X32_ABI) += vdso-image-x32.o
obj-$(CONFIG_COMPAT_32) += vdso-image-32.o vdso32-setup.o
-OBJECT_FILES_NON_STANDARD_vdso-image-32.o := n
-OBJECT_FILES_NON_STANDARD_vdso-image-x32.o := n
-OBJECT_FILES_NON_STANDARD_vdso-image-64.o := n
-OBJECT_FILES_NON_STANDARD_vdso32-setup.o := n
-
vobjs := $(addprefix $(obj)/, $(vobjs-y))
vobjs32 := $(addprefix $(obj)/, $(vobjs32-y))
@@ -98,6 +73,7 @@ CFLAGS_REMOVE_vdso32/vclock_gettime.o = -pg
CFLAGS_REMOVE_vgetcpu.o = -pg
CFLAGS_REMOVE_vdso32/vgetcpu.o = -pg
CFLAGS_REMOVE_vsgx.o = -pg
+CFLAGS_REMOVE_vgetrandom.o = -pg
#
# X32 processes use x32 vDSO to access 64bit kernel data.
@@ -176,11 +152,10 @@ quiet_cmd_vdso = VDSO $@
cmd_vdso = $(LD) -o $@ \
$(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \
-T $(filter %.lds,$^) $(filter %.o,$^) && \
- sh $(srctree)/$(src)/checkundef.sh '$(NM)' '$@'
+ sh $(src)/checkundef.sh '$(NM)' '$@'
VDSO_LDFLAGS = -shared --hash-style=both --build-id=sha1 \
$(call ld-option, --eh-frame-hdr) -Bsymbolic -z noexecstack
-GCOV_PROFILE := n
quiet_cmd_vdso_and_check = VDSO $@
cmd_vdso_and_check = $(cmd_vdso); $(cmd_vdso_check)
diff --git a/arch/x86/entry/vdso/vdso.lds.S b/arch/x86/entry/vdso/vdso.lds.S
index e8c60ae7a7c8..0bab5f4af6d1 100644
--- a/arch/x86/entry/vdso/vdso.lds.S
+++ b/arch/x86/entry/vdso/vdso.lds.S
@@ -30,6 +30,8 @@ VERSION {
#ifdef CONFIG_X86_SGX
__vdso_sgx_enter_enclave;
#endif
+ getrandom;
+ __vdso_getrandom;
local: *;
};
}
diff --git a/arch/x86/entry/vdso/vgetrandom-chacha.S b/arch/x86/entry/vdso/vgetrandom-chacha.S
new file mode 100644
index 000000000000..bcba5639b8ee
--- /dev/null
+++ b/arch/x86/entry/vdso/vgetrandom-chacha.S
@@ -0,0 +1,178 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2022-2024 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+
+.section .rodata, "a"
+.align 16
+CONSTANTS: .octa 0x6b20657479622d323320646e61707865
+.text
+
+/*
+ * Very basic SSE2 implementation of ChaCha20. Produces a given positive number
+ * of blocks of output with a nonce of 0, taking an input key and 8-byte
+ * counter. Importantly does not spill to the stack. Its arguments are:
+ *
+ * rdi: output bytes
+ * rsi: 32-byte key input
+ * rdx: 8-byte counter input/output
+ * rcx: number of 64-byte blocks to write to output
+ */
+SYM_FUNC_START(__arch_chacha20_blocks_nostack)
+
+.set output, %rdi
+.set key, %rsi
+.set counter, %rdx
+.set nblocks, %rcx
+.set i, %al
+/* xmm registers are *not* callee-save. */
+.set temp, %xmm0
+.set state0, %xmm1
+.set state1, %xmm2
+.set state2, %xmm3
+.set state3, %xmm4
+.set copy0, %xmm5
+.set copy1, %xmm6
+.set copy2, %xmm7
+.set copy3, %xmm8
+.set one, %xmm9
+
+ /* copy0 = "expand 32-byte k" */
+ movaps CONSTANTS(%rip),copy0
+ /* copy1,copy2 = key */
+ movups 0x00(key),copy1
+ movups 0x10(key),copy2
+ /* copy3 = counter || zero nonce */
+ movq 0x00(counter),copy3
+ /* one = 1 || 0 */
+ movq $1,%rax
+ movq %rax,one
+
+.Lblock:
+ /* state0,state1,state2,state3 = copy0,copy1,copy2,copy3 */
+ movdqa copy0,state0
+ movdqa copy1,state1
+ movdqa copy2,state2
+ movdqa copy3,state3
+
+ movb $10,i
+.Lpermute:
+ /* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */
+ paddd state1,state0
+ pxor state0,state3
+ movdqa state3,temp
+ pslld $16,temp
+ psrld $16,state3
+ por temp,state3
+
+ /* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */
+ paddd state3,state2
+ pxor state2,state1
+ movdqa state1,temp
+ pslld $12,temp
+ psrld $20,state1
+ por temp,state1
+
+ /* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */
+ paddd state1,state0
+ pxor state0,state3
+ movdqa state3,temp
+ pslld $8,temp
+ psrld $24,state3
+ por temp,state3
+
+ /* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */
+ paddd state3,state2
+ pxor state2,state1
+ movdqa state1,temp
+ pslld $7,temp
+ psrld $25,state1
+ por temp,state1
+
+ /* state1[0,1,2,3] = state1[1,2,3,0] */
+ pshufd $0x39,state1,state1
+ /* state2[0,1,2,3] = state2[2,3,0,1] */
+ pshufd $0x4e,state2,state2
+ /* state3[0,1,2,3] = state3[3,0,1,2] */
+ pshufd $0x93,state3,state3
+
+ /* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */
+ paddd state1,state0
+ pxor state0,state3
+ movdqa state3,temp
+ pslld $16,temp
+ psrld $16,state3
+ por temp,state3
+
+ /* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */
+ paddd state3,state2
+ pxor state2,state1
+ movdqa state1,temp
+ pslld $12,temp
+ psrld $20,state1
+ por temp,state1
+
+ /* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */
+ paddd state1,state0
+ pxor state0,state3
+ movdqa state3,temp
+ pslld $8,temp
+ psrld $24,state3
+ por temp,state3
+
+ /* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */
+ paddd state3,state2
+ pxor state2,state1
+ movdqa state1,temp
+ pslld $7,temp
+ psrld $25,state1
+ por temp,state1
+
+ /* state1[0,1,2,3] = state1[3,0,1,2] */
+ pshufd $0x93,state1,state1
+ /* state2[0,1,2,3] = state2[2,3,0,1] */
+ pshufd $0x4e,state2,state2
+ /* state3[0,1,2,3] = state3[1,2,3,0] */
+ pshufd $0x39,state3,state3
+
+ decb i
+ jnz .Lpermute
+
+ /* output0 = state0 + copy0 */
+ paddd copy0,state0
+ movups state0,0x00(output)
+ /* output1 = state1 + copy1 */
+ paddd copy1,state1
+ movups state1,0x10(output)
+ /* output2 = state2 + copy2 */
+ paddd copy2,state2
+ movups state2,0x20(output)
+ /* output3 = state3 + copy3 */
+ paddd copy3,state3
+ movups state3,0x30(output)
+
+ /* ++copy3.counter */
+ paddq one,copy3
+
+ /* output += 64, --nblocks */
+ addq $64,output
+ decq nblocks
+ jnz .Lblock
+
+ /* counter = copy3.counter */
+ movq copy3,0x00(counter)
+
+ /* Zero out the potentially sensitive regs, in case nothing uses these again. */
+ pxor state0,state0
+ pxor state1,state1
+ pxor state2,state2
+ pxor state3,state3
+ pxor copy1,copy1
+ pxor copy2,copy2
+ pxor temp,temp
+
+ ret
+SYM_FUNC_END(__arch_chacha20_blocks_nostack)
diff --git a/arch/x86/entry/vdso/vgetrandom.c b/arch/x86/entry/vdso/vgetrandom.c
new file mode 100644
index 000000000000..430862b8977c
--- /dev/null
+++ b/arch/x86/entry/vdso/vgetrandom.c
@@ -0,0 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2022-2024 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+#include <linux/types.h>
+
+#include "../../../../lib/vdso/getrandom.c"
+
+ssize_t __vdso_getrandom(void *buffer, size_t len, unsigned int flags, void *opaque_state, size_t opaque_len)
+{
+ return __cvdso_getrandom(buffer, len, flags, opaque_state, opaque_len);
+}
+
+ssize_t getrandom(void *, size_t, unsigned int, void *, size_t)
+ __attribute__((weak, alias("__vdso_getrandom")));
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index 6d83ceb7f1ba..b8fed8b8b9cc 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -38,6 +38,9 @@ struct vdso_data *arch_get_vdso_data(void *vvar_page)
}
#undef EMIT_VVAR
+DEFINE_VVAR(struct vdso_data, _vdso_data);
+DEFINE_VVAR_SINGLE(struct vdso_rng_data, _vdso_rng_data);
+
unsigned int vclocks_used __read_mostly;
#if defined(CONFIG_X86_64)
diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c
index 985ef3b47919..920e3a640cad 100644
--- a/arch/x86/events/amd/core.c
+++ b/arch/x86/events/amd/core.c
@@ -432,8 +432,10 @@ static void __amd_put_nb_event_constraints(struct cpu_hw_events *cpuc,
* be removed on one CPU at a time AND PMU is disabled
* when we come here
*/
- for (i = 0; i < x86_pmu.num_counters; i++) {
- if (cmpxchg(nb->owners + i, event, NULL) == event)
+ for_each_set_bit(i, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) {
+ struct perf_event *tmp = event;
+
+ if (try_cmpxchg(nb->owners + i, &tmp, NULL))
break;
}
}
@@ -499,7 +501,7 @@ __amd_get_nb_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *ev
* because of successive calls to x86_schedule_events() from
* hw_perf_group_sched_in() without hw_perf_enable()
*/
- for_each_set_bit(idx, c->idxmsk, x86_pmu.num_counters) {
+ for_each_set_bit(idx, c->idxmsk, x86_pmu_max_num_counters(NULL)) {
if (new == -1 || hwc->idx == idx)
/* assign free slot, prefer hwc->idx */
old = cmpxchg(nb->owners + idx, NULL, event);
@@ -542,7 +544,7 @@ static struct amd_nb *amd_alloc_nb(int cpu)
/*
* initialize all possible NB constraints
*/
- for (i = 0; i < x86_pmu.num_counters; i++) {
+ for_each_set_bit(i, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) {
__set_bit(i, nb->event_constraints[i].idxmsk);
nb->event_constraints[i].weight = 1;
}
@@ -647,7 +649,7 @@ static void amd_pmu_cpu_dead(int cpu)
}
}
-static inline void amd_pmu_set_global_ctl(u64 ctl)
+static __always_inline void amd_pmu_set_global_ctl(u64 ctl)
{
wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, ctl);
}
@@ -735,7 +737,7 @@ static void amd_pmu_check_overflow(void)
* counters are always enabled when this function is called and
* ARCH_PERFMON_EVENTSEL_INT is always set.
*/
- for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+ for_each_set_bit(idx, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) {
if (!test_bit(idx, cpuc->active_mask))
continue;
@@ -755,7 +757,7 @@ static void amd_pmu_enable_all(int added)
amd_brs_enable_all();
- for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+ for_each_set_bit(idx, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) {
/* only activate events which are marked as active */
if (!test_bit(idx, cpuc->active_mask))
continue;
@@ -907,6 +909,37 @@ static int amd_pmu_handle_irq(struct pt_regs *regs)
return amd_pmu_adjust_nmi_window(handled);
}
+/*
+ * AMD-specific callback invoked through perf_snapshot_branch_stack static
+ * call, defined in include/linux/perf_event.h. See its definition for API
+ * details. It's up to caller to provide enough space in *entries* to fit all
+ * LBR records, otherwise returned result will be truncated to *cnt* entries.
+ */
+static int amd_pmu_v2_snapshot_branch_stack(struct perf_branch_entry *entries, unsigned int cnt)
+{
+ struct cpu_hw_events *cpuc;
+ unsigned long flags;
+
+ /*
+ * The sequence of steps to freeze LBR should be completely inlined
+ * and contain no branches to minimize contamination of LBR snapshot
+ */
+ local_irq_save(flags);
+ amd_pmu_core_disable_all();
+ __amd_pmu_lbr_disable();
+
+ cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ amd_pmu_lbr_read();
+ cnt = min(cnt, x86_pmu.lbr_nr);
+ memcpy(entries, cpuc->lbr_entries, sizeof(struct perf_branch_entry) * cnt);
+
+ amd_pmu_v2_enable_all(0);
+ local_irq_restore(flags);
+
+ return cnt;
+}
+
static int amd_pmu_v2_handle_irq(struct pt_regs *regs)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
@@ -947,7 +980,7 @@ static int amd_pmu_v2_handle_irq(struct pt_regs *regs)
/* Clear any reserved bits set by buggy microcode */
status &= amd_pmu_global_cntr_mask;
- for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+ for_each_set_bit(idx, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) {
if (!test_bit(idx, cpuc->active_mask))
continue;
@@ -1282,7 +1315,7 @@ static __initconst const struct x86_pmu amd_pmu = {
.addr_offset = amd_pmu_addr_offset,
.event_map = amd_pmu_event_map,
.max_events = ARRAY_SIZE(amd_perfmon_event_map),
- .num_counters = AMD64_NUM_COUNTERS,
+ .cntr_mask64 = GENMASK_ULL(AMD64_NUM_COUNTERS - 1, 0),
.add = amd_pmu_add_event,
.del = amd_pmu_del_event,
.cntval_bits = 48,
@@ -1381,7 +1414,7 @@ static int __init amd_core_pmu_init(void)
*/
x86_pmu.eventsel = MSR_F15H_PERF_CTL;
x86_pmu.perfctr = MSR_F15H_PERF_CTR;
- x86_pmu.num_counters = AMD64_NUM_COUNTERS_CORE;
+ x86_pmu.cntr_mask64 = GENMASK_ULL(AMD64_NUM_COUNTERS_CORE - 1, 0);
/* Check for Performance Monitoring v2 support */
if (boot_cpu_has(X86_FEATURE_PERFMON_V2)) {
@@ -1391,9 +1424,9 @@ static int __init amd_core_pmu_init(void)
x86_pmu.version = 2;
/* Find the number of available Core PMCs */
- x86_pmu.num_counters = ebx.split.num_core_pmc;
+ x86_pmu.cntr_mask64 = GENMASK_ULL(ebx.split.num_core_pmc - 1, 0);
- amd_pmu_global_cntr_mask = (1ULL << x86_pmu.num_counters) - 1;
+ amd_pmu_global_cntr_mask = x86_pmu.cntr_mask64;
/* Update PMC handling functions */
x86_pmu.enable_all = amd_pmu_v2_enable_all;
@@ -1421,12 +1454,12 @@ static int __init amd_core_pmu_init(void)
* even numbered counter that has a consecutive adjacent odd
* numbered counter following it.
*/
- for (i = 0; i < x86_pmu.num_counters - 1; i += 2)
+ for (i = 0; i < x86_pmu_max_num_counters(NULL) - 1; i += 2)
even_ctr_mask |= BIT_ULL(i);
pair_constraint = (struct event_constraint)
__EVENT_CONSTRAINT(0, even_ctr_mask, 0,
- x86_pmu.num_counters / 2, 0,
+ x86_pmu_max_num_counters(NULL) / 2, 0,
PERF_X86_EVENT_PAIR);
x86_pmu.get_event_constraints = amd_get_event_constraints_f17h;
@@ -1443,6 +1476,10 @@ static int __init amd_core_pmu_init(void)
static_call_update(amd_pmu_branch_reset, amd_pmu_lbr_reset);
static_call_update(amd_pmu_branch_add, amd_pmu_lbr_add);
static_call_update(amd_pmu_branch_del, amd_pmu_lbr_del);
+
+ /* Only support branch_stack snapshot on perfmon v2 */
+ if (x86_pmu.handle_irq == amd_pmu_v2_handle_irq)
+ static_call_update(perf_snapshot_branch_stack, amd_pmu_v2_snapshot_branch_stack);
} else if (!amd_brs_init()) {
/*
* BRS requires special event constraints and flushing on ctxsw.
diff --git a/arch/x86/events/amd/lbr.c b/arch/x86/events/amd/lbr.c
index 5149830c7c4f..19c7b76e21bc 100644
--- a/arch/x86/events/amd/lbr.c
+++ b/arch/x86/events/amd/lbr.c
@@ -310,10 +310,6 @@ int amd_pmu_lbr_hw_config(struct perf_event *event)
{
int ret = 0;
- /* LBR is not recommended in counting mode */
- if (!is_sampling_event(event))
- return -EINVAL;
-
ret = amd_pmu_lbr_setup_filter(event);
if (!ret)
event->attach_state |= PERF_ATTACH_SCHED_CB;
@@ -414,18 +410,11 @@ void amd_pmu_lbr_enable_all(void)
void amd_pmu_lbr_disable_all(void)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
- u64 dbg_ctl, dbg_extn_cfg;
if (!cpuc->lbr_users || !x86_pmu.lbr_nr)
return;
- rdmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg);
- wrmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg & ~DBG_EXTN_CFG_LBRV2EN);
-
- if (cpu_feature_enabled(X86_FEATURE_AMD_LBR_PMC_FREEZE)) {
- rdmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl);
- wrmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl & ~DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
- }
+ __amd_pmu_lbr_disable();
}
__init int amd_pmu_lbr_init(void)
diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c
index 4ccb8fa483e6..0bfde2ea5cb8 100644
--- a/arch/x86/events/amd/uncore.c
+++ b/arch/x86/events/amd/uncore.c
@@ -162,7 +162,9 @@ static int amd_uncore_add(struct perf_event *event, int flags)
/* if not, take the first available counter */
hwc->idx = -1;
for (i = 0; i < pmu->num_counters; i++) {
- if (cmpxchg(&ctx->events[i], NULL, event) == NULL) {
+ struct perf_event *tmp = NULL;
+
+ if (try_cmpxchg(&ctx->events[i], &tmp, event)) {
hwc->idx = i;
break;
}
@@ -196,7 +198,9 @@ static void amd_uncore_del(struct perf_event *event, int flags)
event->pmu->stop(event, PERF_EF_UPDATE);
for (i = 0; i < pmu->num_counters; i++) {
- if (cmpxchg(&ctx->events[i], event, NULL) == event)
+ struct perf_event *tmp = event;
+
+ if (try_cmpxchg(&ctx->events[i], &tmp, NULL))
break;
}
@@ -639,7 +643,7 @@ void amd_uncore_df_ctx_scan(struct amd_uncore *uncore, unsigned int cpu)
info.split.aux_data = 0;
info.split.num_pmcs = NUM_COUNTERS_NB;
info.split.gid = 0;
- info.split.cid = topology_die_id(cpu);
+ info.split.cid = topology_logical_package_id(cpu);
if (pmu_version >= 2) {
ebx.full = cpuid_ebx(EXT_PERFMON_DEBUG_FEATURES);
@@ -654,17 +658,20 @@ int amd_uncore_df_ctx_init(struct amd_uncore *uncore, unsigned int cpu)
{
struct attribute **df_attr = amd_uncore_df_format_attr;
struct amd_uncore_pmu *pmu;
+ int num_counters;
/* Run just once */
if (uncore->init_done)
return amd_uncore_ctx_init(uncore, cpu);
+ num_counters = amd_uncore_ctx_num_pmcs(uncore, cpu);
+ if (!num_counters)
+ goto done;
+
/* No grouping, single instance for a system */
uncore->pmus = kzalloc(sizeof(*uncore->pmus), GFP_KERNEL);
- if (!uncore->pmus) {
- uncore->num_pmus = 0;
+ if (!uncore->pmus)
goto done;
- }
/*
* For Family 17h and above, the Northbridge counters are repurposed
@@ -674,7 +681,7 @@ int amd_uncore_df_ctx_init(struct amd_uncore *uncore, unsigned int cpu)
pmu = &uncore->pmus[0];
strscpy(pmu->name, boot_cpu_data.x86 >= 0x17 ? "amd_df" : "amd_nb",
sizeof(pmu->name));
- pmu->num_counters = amd_uncore_ctx_num_pmcs(uncore, cpu);
+ pmu->num_counters = num_counters;
pmu->msr_base = MSR_F15H_NB_PERF_CTL;
pmu->rdpmc_base = RDPMC_BASE_NB;
pmu->group = amd_uncore_ctx_gid(uncore, cpu);
@@ -785,17 +792,20 @@ int amd_uncore_l3_ctx_init(struct amd_uncore *uncore, unsigned int cpu)
{
struct attribute **l3_attr = amd_uncore_l3_format_attr;
struct amd_uncore_pmu *pmu;
+ int num_counters;
/* Run just once */
if (uncore->init_done)
return amd_uncore_ctx_init(uncore, cpu);
+ num_counters = amd_uncore_ctx_num_pmcs(uncore, cpu);
+ if (!num_counters)
+ goto done;
+
/* No grouping, single instance for a system */
uncore->pmus = kzalloc(sizeof(*uncore->pmus), GFP_KERNEL);
- if (!uncore->pmus) {
- uncore->num_pmus = 0;
+ if (!uncore->pmus)
goto done;
- }
/*
* For Family 17h and above, L3 cache counters are available instead
@@ -805,7 +815,7 @@ int amd_uncore_l3_ctx_init(struct amd_uncore *uncore, unsigned int cpu)
pmu = &uncore->pmus[0];
strscpy(pmu->name, boot_cpu_data.x86 >= 0x17 ? "amd_l3" : "amd_l2",
sizeof(pmu->name));
- pmu->num_counters = amd_uncore_ctx_num_pmcs(uncore, cpu);
+ pmu->num_counters = num_counters;
pmu->msr_base = MSR_F16H_L2I_PERF_CTL;
pmu->rdpmc_base = RDPMC_BASE_LLC;
pmu->group = amd_uncore_ctx_gid(uncore, cpu);
@@ -893,8 +903,8 @@ void amd_uncore_umc_ctx_scan(struct amd_uncore *uncore, unsigned int cpu)
cpuid(EXT_PERFMON_DEBUG_FEATURES, &eax, &ebx.full, &ecx, &edx);
info.split.aux_data = ecx; /* stash active mask */
info.split.num_pmcs = ebx.split.num_umc_pmc;
- info.split.gid = topology_die_id(cpu);
- info.split.cid = topology_die_id(cpu);
+ info.split.gid = topology_logical_package_id(cpu);
+ info.split.cid = topology_logical_package_id(cpu);
*per_cpu_ptr(uncore->info, cpu) = info;
}
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 5b0dd07b1ef1..65ab6460aed4 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -41,6 +41,8 @@
#include <asm/desc.h>
#include <asm/ldt.h>
#include <asm/unwind.h>
+#include <asm/uprobes.h>
+#include <asm/ibt.h>
#include "perf_event.h"
@@ -189,29 +191,31 @@ static DEFINE_MUTEX(pmc_reserve_mutex);
#ifdef CONFIG_X86_LOCAL_APIC
-static inline int get_possible_num_counters(void)
+static inline u64 get_possible_counter_mask(void)
{
- int i, num_counters = x86_pmu.num_counters;
+ u64 cntr_mask = x86_pmu.cntr_mask64;
+ int i;
if (!is_hybrid())
- return num_counters;
+ return cntr_mask;
for (i = 0; i < x86_pmu.num_hybrid_pmus; i++)
- num_counters = max_t(int, num_counters, x86_pmu.hybrid_pmu[i].num_counters);
+ cntr_mask |= x86_pmu.hybrid_pmu[i].cntr_mask64;
- return num_counters;
+ return cntr_mask;
}
static bool reserve_pmc_hardware(void)
{
- int i, num_counters = get_possible_num_counters();
+ u64 cntr_mask = get_possible_counter_mask();
+ int i, end;
- for (i = 0; i < num_counters; i++) {
+ for_each_set_bit(i, (unsigned long *)&cntr_mask, X86_PMC_IDX_MAX) {
if (!reserve_perfctr_nmi(x86_pmu_event_addr(i)))
goto perfctr_fail;
}
- for (i = 0; i < num_counters; i++) {
+ for_each_set_bit(i, (unsigned long *)&cntr_mask, X86_PMC_IDX_MAX) {
if (!reserve_evntsel_nmi(x86_pmu_config_addr(i)))
goto eventsel_fail;
}
@@ -219,13 +223,14 @@ static bool reserve_pmc_hardware(void)
return true;
eventsel_fail:
- for (i--; i >= 0; i--)
+ end = i;
+ for_each_set_bit(i, (unsigned long *)&cntr_mask, end)
release_evntsel_nmi(x86_pmu_config_addr(i));
-
- i = num_counters;
+ i = X86_PMC_IDX_MAX;
perfctr_fail:
- for (i--; i >= 0; i--)
+ end = i;
+ for_each_set_bit(i, (unsigned long *)&cntr_mask, end)
release_perfctr_nmi(x86_pmu_event_addr(i));
return false;
@@ -233,9 +238,10 @@ perfctr_fail:
static void release_pmc_hardware(void)
{
- int i, num_counters = get_possible_num_counters();
+ u64 cntr_mask = get_possible_counter_mask();
+ int i;
- for (i = 0; i < num_counters; i++) {
+ for_each_set_bit(i, (unsigned long *)&cntr_mask, X86_PMC_IDX_MAX) {
release_perfctr_nmi(x86_pmu_event_addr(i));
release_evntsel_nmi(x86_pmu_config_addr(i));
}
@@ -248,7 +254,8 @@ static void release_pmc_hardware(void) {}
#endif
-bool check_hw_exists(struct pmu *pmu, int num_counters, int num_counters_fixed)
+bool check_hw_exists(struct pmu *pmu, unsigned long *cntr_mask,
+ unsigned long *fixed_cntr_mask)
{
u64 val, val_fail = -1, val_new= ~0;
int i, reg, reg_fail = -1, ret = 0;
@@ -259,7 +266,7 @@ bool check_hw_exists(struct pmu *pmu, int num_counters, int num_counters_fixed)
* Check to see if the BIOS enabled any of the counters, if so
* complain and bail.
*/
- for (i = 0; i < num_counters; i++) {
+ for_each_set_bit(i, cntr_mask, X86_PMC_IDX_MAX) {
reg = x86_pmu_config_addr(i);
ret = rdmsrl_safe(reg, &val);
if (ret)
@@ -273,12 +280,12 @@ bool check_hw_exists(struct pmu *pmu, int num_counters, int num_counters_fixed)
}
}
- if (num_counters_fixed) {
+ if (*(u64 *)fixed_cntr_mask) {
reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
ret = rdmsrl_safe(reg, &val);
if (ret)
goto msr_fail;
- for (i = 0; i < num_counters_fixed; i++) {
+ for_each_set_bit(i, fixed_cntr_mask, X86_PMC_IDX_MAX) {
if (fixed_counter_disabled(i, pmu))
continue;
if (val & (0x03ULL << i*4)) {
@@ -619,7 +626,7 @@ int x86_pmu_hw_config(struct perf_event *event)
event->hw.config |= ARCH_PERFMON_EVENTSEL_OS;
if (event->attr.type == event->pmu->type)
- event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;
+ event->hw.config |= x86_pmu_get_event_config(event);
if (event->attr.sample_period && x86_pmu.limit_period) {
s64 left = event->attr.sample_period;
@@ -679,7 +686,7 @@ void x86_pmu_disable_all(void)
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
int idx;
- for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+ for_each_set_bit(idx, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) {
struct hw_perf_event *hwc = &cpuc->events[idx]->hw;
u64 val;
@@ -736,7 +743,7 @@ void x86_pmu_enable_all(int added)
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
int idx;
- for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+ for_each_set_bit(idx, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) {
struct hw_perf_event *hwc = &cpuc->events[idx]->hw;
if (!test_bit(idx, cpuc->active_mask))
@@ -975,7 +982,6 @@ EXPORT_SYMBOL_GPL(perf_assign_events);
int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
{
- int num_counters = hybrid(cpuc->pmu, num_counters);
struct event_constraint *c;
struct perf_event *e;
int n0, i, wmin, wmax, unsched = 0;
@@ -1051,7 +1057,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
/* slow path */
if (i != n) {
- int gpmax = num_counters;
+ int gpmax = x86_pmu_max_num_counters(cpuc->pmu);
/*
* Do not allow scheduling of more than half the available
@@ -1072,7 +1078,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
* the extra Merge events needed by large increment events.
*/
if (x86_pmu.flags & PMU_FL_PAIR) {
- gpmax = num_counters - cpuc->n_pair;
+ gpmax -= cpuc->n_pair;
WARN_ON(gpmax <= 0);
}
@@ -1157,12 +1163,10 @@ static int collect_event(struct cpu_hw_events *cpuc, struct perf_event *event,
*/
static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp)
{
- int num_counters = hybrid(cpuc->pmu, num_counters);
- int num_counters_fixed = hybrid(cpuc->pmu, num_counters_fixed);
struct perf_event *event;
int n, max_count;
- max_count = num_counters + num_counters_fixed;
+ max_count = x86_pmu_num_counters(cpuc->pmu) + x86_pmu_num_counters_fixed(cpuc->pmu);
/* current number of events already accepted */
n = cpuc->n_events;
@@ -1234,8 +1238,7 @@ static inline void x86_assign_hw_event(struct perf_event *event,
fallthrough;
case INTEL_PMC_IDX_FIXED ... INTEL_PMC_IDX_FIXED_BTS-1:
hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
- hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 +
- (idx - INTEL_PMC_IDX_FIXED);
+ hwc->event_base = x86_pmu_fixed_ctr_addr(idx - INTEL_PMC_IDX_FIXED);
hwc->event_base_rdpmc = (idx - INTEL_PMC_IDX_FIXED) |
INTEL_PMC_FIXED_RDPMC_BASE;
break;
@@ -1519,19 +1522,22 @@ static void x86_pmu_start(struct perf_event *event, int flags)
void perf_event_print_debug(void)
{
u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
+ unsigned long *cntr_mask, *fixed_cntr_mask;
+ struct event_constraint *pebs_constraints;
+ struct cpu_hw_events *cpuc;
u64 pebs, debugctl;
- int cpu = smp_processor_id();
- struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
- int num_counters = hybrid(cpuc->pmu, num_counters);
- int num_counters_fixed = hybrid(cpuc->pmu, num_counters_fixed);
- struct event_constraint *pebs_constraints = hybrid(cpuc->pmu, pebs_constraints);
- unsigned long flags;
- int idx;
+ int cpu, idx;
- if (!num_counters)
- return;
+ guard(irqsave)();
- local_irq_save(flags);
+ cpu = smp_processor_id();
+ cpuc = &per_cpu(cpu_hw_events, cpu);
+ cntr_mask = hybrid(cpuc->pmu, cntr_mask);
+ fixed_cntr_mask = hybrid(cpuc->pmu, fixed_cntr_mask);
+ pebs_constraints = hybrid(cpuc->pmu, pebs_constraints);
+
+ if (!*(u64 *)cntr_mask)
+ return;
if (x86_pmu.version >= 2) {
rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
@@ -1555,7 +1561,7 @@ void perf_event_print_debug(void)
}
pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask);
- for (idx = 0; idx < num_counters; idx++) {
+ for_each_set_bit(idx, cntr_mask, X86_PMC_IDX_MAX) {
rdmsrl(x86_pmu_config_addr(idx), pmc_ctrl);
rdmsrl(x86_pmu_event_addr(idx), pmc_count);
@@ -1568,15 +1574,14 @@ void perf_event_print_debug(void)
pr_info("CPU#%d: gen-PMC%d left: %016llx\n",
cpu, idx, prev_left);
}
- for (idx = 0; idx < num_counters_fixed; idx++) {
+ for_each_set_bit(idx, fixed_cntr_mask, X86_PMC_IDX_MAX) {
if (fixed_counter_disabled(idx, cpuc->pmu))
continue;
- rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
+ rdmsrl(x86_pmu_fixed_ctr_addr(idx), pmc_count);
pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
cpu, idx, pmc_count);
}
- local_irq_restore(flags);
}
void x86_pmu_stop(struct perf_event *event, int flags)
@@ -1682,7 +1687,7 @@ int x86_pmu_handle_irq(struct pt_regs *regs)
*/
apic_write(APIC_LVTPC, APIC_DM_NMI);
- for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+ for_each_set_bit(idx, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) {
if (!test_bit(idx, cpuc->active_mask))
continue;
@@ -2038,18 +2043,15 @@ static void _x86_pmu_read(struct perf_event *event)
static_call(x86_pmu_update)(event);
}
-void x86_pmu_show_pmu_cap(int num_counters, int num_counters_fixed,
- u64 intel_ctrl)
+void x86_pmu_show_pmu_cap(struct pmu *pmu)
{
pr_info("... version: %d\n", x86_pmu.version);
pr_info("... bit width: %d\n", x86_pmu.cntval_bits);
- pr_info("... generic registers: %d\n", num_counters);
+ pr_info("... generic registers: %d\n", x86_pmu_num_counters(pmu));
pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask);
pr_info("... max period: %016Lx\n", x86_pmu.max_period);
- pr_info("... fixed-purpose events: %lu\n",
- hweight64((((1ULL << num_counters_fixed) - 1)
- << INTEL_PMC_IDX_FIXED) & intel_ctrl));
- pr_info("... event mask: %016Lx\n", intel_ctrl);
+ pr_info("... fixed-purpose events: %d\n", x86_pmu_num_counters_fixed(pmu));
+ pr_info("... event mask: %016Lx\n", hybrid(pmu, intel_ctrl));
}
static int __init init_hw_perf_events(void)
@@ -2086,7 +2088,7 @@ static int __init init_hw_perf_events(void)
pmu_check_apic();
/* sanity check that the hardware exists or is emulated */
- if (!check_hw_exists(&pmu, x86_pmu.num_counters, x86_pmu.num_counters_fixed))
+ if (!check_hw_exists(&pmu, x86_pmu.cntr_mask, x86_pmu.fixed_cntr_mask))
goto out_bad_pmu;
pr_cont("%s PMU driver.\n", x86_pmu.name);
@@ -2097,14 +2099,17 @@ static int __init init_hw_perf_events(void)
quirk->func();
if (!x86_pmu.intel_ctrl)
- x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
+ x86_pmu.intel_ctrl = x86_pmu.cntr_mask64;
+
+ if (!x86_pmu.config_mask)
+ x86_pmu.config_mask = X86_RAW_EVENT_MASK;
perf_events_lapic_init();
register_nmi_handler(NMI_LOCAL, perf_event_nmi_handler, 0, "PMI");
unconstrained = (struct event_constraint)
- __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1,
- 0, x86_pmu.num_counters, 0, 0);
+ __EVENT_CONSTRAINT(0, x86_pmu.cntr_mask64,
+ 0, x86_pmu_num_counters(NULL), 0, 0);
x86_pmu_format_group.attrs = x86_pmu.format_attrs;
@@ -2113,11 +2118,8 @@ static int __init init_hw_perf_events(void)
pmu.attr_update = x86_pmu.attr_update;
- if (!is_hybrid()) {
- x86_pmu_show_pmu_cap(x86_pmu.num_counters,
- x86_pmu.num_counters_fixed,
- x86_pmu.intel_ctrl);
- }
+ if (!is_hybrid())
+ x86_pmu_show_pmu_cap(NULL);
if (!x86_pmu.read)
x86_pmu.read = _x86_pmu_read;
@@ -2481,10 +2483,10 @@ void perf_clear_dirty_counters(void)
for_each_set_bit(i, cpuc->dirty, X86_PMC_IDX_MAX) {
if (i >= INTEL_PMC_IDX_FIXED) {
/* Metrics and fake events don't have corresponding HW counters. */
- if ((i - INTEL_PMC_IDX_FIXED) >= hybrid(cpuc->pmu, num_counters_fixed))
+ if (!test_bit(i - INTEL_PMC_IDX_FIXED, hybrid(cpuc->pmu, fixed_cntr_mask)))
continue;
- wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + (i - INTEL_PMC_IDX_FIXED), 0);
+ wrmsrl(x86_pmu_fixed_ctr_addr(i - INTEL_PMC_IDX_FIXED), 0);
} else {
wrmsrl(x86_pmu_event_addr(i), 0);
}
@@ -2547,6 +2549,7 @@ static ssize_t set_attr_rdpmc(struct device *cdev,
struct device_attribute *attr,
const char *buf, size_t count)
{
+ static DEFINE_MUTEX(rdpmc_mutex);
unsigned long val;
ssize_t ret;
@@ -2560,6 +2563,8 @@ static ssize_t set_attr_rdpmc(struct device *cdev,
if (x86_pmu.attr_rdpmc_broken)
return -ENOTSUPP;
+ guard(mutex)(&rdpmc_mutex);
+
if (val != x86_pmu.attr_rdpmc) {
/*
* Changing into or out of never available or always available,
@@ -2813,6 +2818,46 @@ static unsigned long get_segment_base(unsigned int segment)
return get_desc_base(desc);
}
+#ifdef CONFIG_UPROBES
+/*
+ * Heuristic-based check if uprobe is installed at the function entry.
+ *
+ * Under assumption of user code being compiled with frame pointers,
+ * `push %rbp/%ebp` is a good indicator that we indeed are.
+ *
+ * Similarly, `endbr64` (assuming 64-bit mode) is also a common pattern.
+ * If we get this wrong, captured stack trace might have one extra bogus
+ * entry, but the rest of stack trace will still be meaningful.
+ */
+static bool is_uprobe_at_func_entry(struct pt_regs *regs)
+{
+ struct arch_uprobe *auprobe;
+
+ if (!current->utask)
+ return false;
+
+ auprobe = current->utask->auprobe;
+ if (!auprobe)
+ return false;
+
+ /* push %rbp/%ebp */
+ if (auprobe->insn[0] == 0x55)
+ return true;
+
+ /* endbr64 (64-bit only) */
+ if (user_64bit_mode(regs) && is_endbr(*(u32 *)auprobe->insn))
+ return true;
+
+ return false;
+}
+
+#else
+static bool is_uprobe_at_func_entry(struct pt_regs *regs)
+{
+ return false;
+}
+#endif /* CONFIG_UPROBES */
+
#ifdef CONFIG_IA32_EMULATION
#include <linux/compat.h>
@@ -2824,6 +2869,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *ent
unsigned long ss_base, cs_base;
struct stack_frame_ia32 frame;
const struct stack_frame_ia32 __user *fp;
+ u32 ret_addr;
if (user_64bit_mode(regs))
return 0;
@@ -2833,6 +2879,12 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *ent
fp = compat_ptr(ss_base + regs->bp);
pagefault_disable();
+
+ /* see perf_callchain_user() below for why we do this */
+ if (is_uprobe_at_func_entry(regs) &&
+ !get_user(ret_addr, (const u32 __user *)regs->sp))
+ perf_callchain_store(entry, ret_addr);
+
while (entry->nr < entry->max_stack) {
if (!valid_user_frame(fp, sizeof(frame)))
break;
@@ -2861,6 +2913,7 @@ perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs
{
struct stack_frame frame;
const struct stack_frame __user *fp;
+ unsigned long ret_addr;
if (perf_guest_state()) {
/* TODO: We don't support guest os callchain now */
@@ -2884,6 +2937,19 @@ perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs
return;
pagefault_disable();
+
+ /*
+ * If we are called from uprobe handler, and we are indeed at the very
+ * entry to user function (which is normally a `push %rbp` instruction,
+ * under assumption of application being compiled with frame pointers),
+ * we should read return address from *regs->sp before proceeding
+ * to follow frame pointers, otherwise we'll skip immediate caller
+ * as %rbp is not yet setup.
+ */
+ if (is_uprobe_at_func_entry(regs) &&
+ !get_user(ret_addr, (const unsigned long __user *)regs->sp))
+ perf_callchain_store(entry, ret_addr);
+
while (entry->nr < entry->max_stack) {
if (!valid_user_frame(fp, sizeof(frame)))
break;
@@ -2983,8 +3049,8 @@ void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
* base PMU holds the correct number of counters for P-cores.
*/
cap->version = x86_pmu.version;
- cap->num_counters_gp = x86_pmu.num_counters;
- cap->num_counters_fixed = x86_pmu.num_counters_fixed;
+ cap->num_counters_gp = x86_pmu_num_counters(NULL);
+ cap->num_counters_fixed = x86_pmu_num_counters_fixed(NULL);
cap->bit_width_gp = x86_pmu.cntval_bits;
cap->bit_width_fixed = x86_pmu.cntval_bits;
cap->events_mask = (unsigned int)x86_pmu.events_maskl;
diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c
index 974e917e65b2..8f78b0c900ef 100644
--- a/arch/x86/events/intel/bts.c
+++ b/arch/x86/events/intel/bts.c
@@ -557,9 +557,6 @@ static int bts_event_init(struct perf_event *event)
* disabled, so disallow intel_bts driver for unprivileged
* users on paranoid systems since it provides trace data
* to the user in a zero-copy fashion.
- *
- * Note that the default paranoia setting permits unprivileged
- * users to profile the kernel.
*/
if (event->attr.exclude_kernel) {
ret = perf_allow_kernel(&event->attr);
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 768d1414897f..d879478db3f5 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -220,6 +220,17 @@ static struct event_constraint intel_grt_event_constraints[] __read_mostly = {
EVENT_CONSTRAINT_END
};
+static struct event_constraint intel_skt_event_constraints[] __read_mostly = {
+ FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* pseudo CPU_CLK_UNHALTED.REF */
+ FIXED_EVENT_CONSTRAINT(0x013c, 2), /* CPU_CLK_UNHALTED.REF_TSC_P */
+ FIXED_EVENT_CONSTRAINT(0x0073, 4), /* TOPDOWN_BAD_SPECULATION.ALL */
+ FIXED_EVENT_CONSTRAINT(0x019c, 5), /* TOPDOWN_FE_BOUND.ALL */
+ FIXED_EVENT_CONSTRAINT(0x02c2, 6), /* TOPDOWN_RETIRING.ALL */
+ EVENT_CONSTRAINT_END
+};
+
static struct event_constraint intel_skl_event_constraints[] = {
FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
@@ -370,6 +381,55 @@ static struct extra_reg intel_rwc_extra_regs[] __read_mostly = {
EVENT_EXTRA_END
};
+static struct event_constraint intel_lnc_event_constraints[] = {
+ FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+ FIXED_EVENT_CONSTRAINT(0x0100, 0), /* INST_RETIRED.PREC_DIST */
+ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
+ FIXED_EVENT_CONSTRAINT(0x013c, 2), /* CPU_CLK_UNHALTED.REF_TSC_P */
+ FIXED_EVENT_CONSTRAINT(0x0400, 3), /* SLOTS */
+ METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_RETIRING, 0),
+ METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_BAD_SPEC, 1),
+ METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_FE_BOUND, 2),
+ METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_BE_BOUND, 3),
+ METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_HEAVY_OPS, 4),
+ METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_BR_MISPREDICT, 5),
+ METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_FETCH_LAT, 6),
+ METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_MEM_BOUND, 7),
+
+ INTEL_UEVENT_CONSTRAINT(0x0148, 0x4),
+ INTEL_UEVENT_CONSTRAINT(0x0175, 0x4),
+
+ INTEL_EVENT_CONSTRAINT(0x2e, 0x3ff),
+ INTEL_EVENT_CONSTRAINT(0x3c, 0x3ff),
+ /*
+ * Generally event codes < 0x90 are restricted to counters 0-3.
+ * The 0x2E and 0x3C are exception, which has no restriction.
+ */
+ INTEL_EVENT_CONSTRAINT_RANGE(0x01, 0x8f, 0xf),
+
+ INTEL_UEVENT_CONSTRAINT(0x01a3, 0xf),
+ INTEL_UEVENT_CONSTRAINT(0x02a3, 0xf),
+ INTEL_UEVENT_CONSTRAINT(0x08a3, 0x4),
+ INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4),
+ INTEL_UEVENT_CONSTRAINT(0x04a4, 0x1),
+ INTEL_UEVENT_CONSTRAINT(0x08a4, 0x1),
+ INTEL_UEVENT_CONSTRAINT(0x10a4, 0x1),
+ INTEL_UEVENT_CONSTRAINT(0x01b1, 0x8),
+ INTEL_UEVENT_CONSTRAINT(0x02cd, 0x3),
+ INTEL_EVENT_CONSTRAINT(0xce, 0x1),
+
+ INTEL_EVENT_CONSTRAINT_RANGE(0xd0, 0xdf, 0xf),
+ /*
+ * Generally event codes >= 0x90 are likely to have no restrictions.
+ * The exception are defined as above.
+ */
+ INTEL_EVENT_CONSTRAINT_RANGE(0x90, 0xfe, 0x3ff),
+
+ EVENT_CONSTRAINT_END
+};
+
+
EVENT_ATTR_STR(mem-loads, mem_ld_nhm, "event=0x0b,umask=0x10,ldlat=3");
EVENT_ATTR_STR(mem-loads, mem_ld_snb, "event=0xcd,umask=0x1,ldlat=3");
EVENT_ATTR_STR(mem-stores, mem_st_snb, "event=0xcd,umask=0x2");
@@ -2874,26 +2934,26 @@ static void intel_pmu_reset(void)
{
struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds);
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
- int num_counters_fixed = hybrid(cpuc->pmu, num_counters_fixed);
- int num_counters = hybrid(cpuc->pmu, num_counters);
+ unsigned long *cntr_mask = hybrid(cpuc->pmu, cntr_mask);
+ unsigned long *fixed_cntr_mask = hybrid(cpuc->pmu, fixed_cntr_mask);
unsigned long flags;
int idx;
- if (!num_counters)
+ if (!*(u64 *)cntr_mask)
return;
local_irq_save(flags);
pr_info("clearing PMU state on CPU#%d\n", smp_processor_id());
- for (idx = 0; idx < num_counters; idx++) {
+ for_each_set_bit(idx, cntr_mask, INTEL_PMC_MAX_GENERIC) {
wrmsrl_safe(x86_pmu_config_addr(idx), 0ull);
wrmsrl_safe(x86_pmu_event_addr(idx), 0ull);
}
- for (idx = 0; idx < num_counters_fixed; idx++) {
+ for_each_set_bit(idx, fixed_cntr_mask, INTEL_PMC_MAX_FIXED) {
if (fixed_counter_disabled(idx, cpuc->pmu))
continue;
- wrmsrl_safe(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
+ wrmsrl_safe(x86_pmu_fixed_ctr_addr(idx), 0ull);
}
if (ds)
@@ -2940,8 +3000,7 @@ static void x86_pmu_handle_guest_pebs(struct pt_regs *regs,
!guest_pebs_idxs)
return;
- for_each_set_bit(bit, (unsigned long *)&guest_pebs_idxs,
- INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed) {
+ for_each_set_bit(bit, (unsigned long *)&guest_pebs_idxs, X86_PMC_IDX_MAX) {
event = cpuc->events[bit];
if (!event->attr.precise_ip)
continue;
@@ -3913,8 +3972,12 @@ static int intel_pmu_hw_config(struct perf_event *event)
x86_pmu.pebs_aliases(event);
}
- if (needs_branch_stack(event) && is_sampling_event(event))
- event->hw.flags |= PERF_X86_EVENT_NEEDS_BRANCH_STACK;
+ if (needs_branch_stack(event)) {
+ /* Avoid branch stack setup for counting events in SAMPLE READ */
+ if (is_sampling_event(event) ||
+ !(event->attr.sample_type & PERF_SAMPLE_READ))
+ event->hw.flags |= PERF_X86_EVENT_NEEDS_BRANCH_STACK;
+ }
if (branch_sample_counters(event)) {
struct perf_event *leader, *sibling;
@@ -4199,7 +4262,7 @@ static struct perf_guest_switch_msr *core_guest_get_msrs(int *nr, void *data)
struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs;
int idx;
- for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+ for_each_set_bit(idx, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) {
struct perf_event *event = cpuc->events[idx];
arr[idx].msr = x86_pmu_config_addr(idx);
@@ -4217,7 +4280,7 @@ static struct perf_guest_switch_msr *core_guest_get_msrs(int *nr, void *data)
arr[idx].guest &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
}
- *nr = x86_pmu.num_counters;
+ *nr = x86_pmu_max_num_counters(cpuc->pmu);
return arr;
}
@@ -4232,7 +4295,7 @@ static void core_pmu_enable_all(int added)
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
int idx;
- for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+ for_each_set_bit(idx, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) {
struct hw_perf_event *hwc = &cpuc->events[idx]->hw;
if (!test_bit(idx, cpuc->active_mask) ||
@@ -4530,6 +4593,25 @@ static enum hybrid_cpu_type adl_get_hybrid_cpu_type(void)
return HYBRID_INTEL_CORE;
}
+static inline bool erratum_hsw11(struct perf_event *event)
+{
+ return (event->hw.config & INTEL_ARCH_EVENT_MASK) ==
+ X86_CONFIG(.event=0xc0, .umask=0x01);
+}
+
+/*
+ * The HSW11 requires a period larger than 100 which is the same as the BDM11.
+ * A minimum period of 128 is enforced as well for the INST_RETIRED.ALL.
+ *
+ * The message 'interrupt took too long' can be observed on any counter which
+ * was armed with a period < 32 and two events expired in the same NMI.
+ * A minimum period of 32 is enforced for the rest of the events.
+ */
+static void hsw_limit_period(struct perf_event *event, s64 *left)
+{
+ *left = max(*left, erratum_hsw11(event) ? 128 : 32);
+}
+
/*
* Broadwell:
*
@@ -4547,8 +4629,7 @@ static enum hybrid_cpu_type adl_get_hybrid_cpu_type(void)
*/
static void bdw_limit_period(struct perf_event *event, s64 *left)
{
- if ((event->hw.config & INTEL_ARCH_EVENT_MASK) ==
- X86_CONFIG(.event=0xc0, .umask=0x01)) {
+ if (erratum_hsw11(event)) {
if (*left < 128)
*left = 128;
*left &= ~0x3fULL;
@@ -4573,8 +4654,55 @@ PMU_FORMAT_ATTR(pc, "config:19" );
PMU_FORMAT_ATTR(any, "config:21" ); /* v3 + */
PMU_FORMAT_ATTR(inv, "config:23" );
PMU_FORMAT_ATTR(cmask, "config:24-31" );
-PMU_FORMAT_ATTR(in_tx, "config:32");
-PMU_FORMAT_ATTR(in_tx_cp, "config:33");
+PMU_FORMAT_ATTR(in_tx, "config:32" );
+PMU_FORMAT_ATTR(in_tx_cp, "config:33" );
+PMU_FORMAT_ATTR(eq, "config:36" ); /* v6 + */
+
+static ssize_t umask2_show(struct device *dev,
+ struct device_attribute *attr,
+ char *page)
+{
+ u64 mask = hybrid(dev_get_drvdata(dev), config_mask) & ARCH_PERFMON_EVENTSEL_UMASK2;
+
+ if (mask == ARCH_PERFMON_EVENTSEL_UMASK2)
+ return sprintf(page, "config:8-15,40-47\n");
+
+ /* Roll back to the old format if umask2 is not supported. */
+ return sprintf(page, "config:8-15\n");
+}
+
+static struct device_attribute format_attr_umask2 =
+ __ATTR(umask, 0444, umask2_show, NULL);
+
+static struct attribute *format_evtsel_ext_attrs[] = {
+ &format_attr_umask2.attr,
+ &format_attr_eq.attr,
+ NULL
+};
+
+static umode_t
+evtsel_ext_is_visible(struct kobject *kobj, struct attribute *attr, int i)
+{
+ struct device *dev = kobj_to_dev(kobj);
+ u64 mask;
+
+ /*
+ * The umask and umask2 have different formats but share the
+ * same attr name. In update mode, the previous value of the
+ * umask is unconditionally removed before is_visible. If
+ * umask2 format is not enumerated, it's impossible to roll
+ * back to the old format.
+ * Does the check in umask2_show rather than is_visible.
+ */
+ if (i == 0)
+ return attr->mode;
+
+ mask = hybrid(dev_get_drvdata(dev), config_mask);
+ if (i == 1)
+ return (mask & ARCH_PERFMON_EVENTSEL_EQ) ? attr->mode : 0;
+
+ return 0;
+}
static struct attribute *intel_arch_formats_attr[] = {
&format_attr_event.attr,
@@ -4684,13 +4812,33 @@ static void flip_smm_bit(void *data)
}
}
-static void intel_pmu_check_num_counters(int *num_counters,
- int *num_counters_fixed,
- u64 *intel_ctrl, u64 fixed_mask);
+static void intel_pmu_check_counters_mask(u64 *cntr_mask,
+ u64 *fixed_cntr_mask,
+ u64 *intel_ctrl)
+{
+ unsigned int bit;
+
+ bit = fls64(*cntr_mask);
+ if (bit > INTEL_PMC_MAX_GENERIC) {
+ WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
+ bit, INTEL_PMC_MAX_GENERIC);
+ *cntr_mask &= GENMASK_ULL(INTEL_PMC_MAX_GENERIC - 1, 0);
+ }
+ *intel_ctrl = *cntr_mask;
+
+ bit = fls64(*fixed_cntr_mask);
+ if (bit > INTEL_PMC_MAX_FIXED) {
+ WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
+ bit, INTEL_PMC_MAX_FIXED);
+ *fixed_cntr_mask &= GENMASK_ULL(INTEL_PMC_MAX_FIXED - 1, 0);
+ }
+
+ *intel_ctrl |= *fixed_cntr_mask << INTEL_PMC_IDX_FIXED;
+}
static void intel_pmu_check_event_constraints(struct event_constraint *event_constraints,
- int num_counters,
- int num_counters_fixed,
+ u64 cntr_mask,
+ u64 fixed_cntr_mask,
u64 intel_ctrl);
static void intel_pmu_check_extra_regs(struct extra_reg *extra_regs);
@@ -4698,8 +4846,8 @@ static void intel_pmu_check_extra_regs(struct extra_reg *extra_regs);
static inline bool intel_pmu_broken_perf_cap(void)
{
/* The Perf Metric (Bit 15) is always cleared */
- if ((boot_cpu_data.x86_model == INTEL_FAM6_METEORLAKE) ||
- (boot_cpu_data.x86_model == INTEL_FAM6_METEORLAKE_L))
+ if (boot_cpu_data.x86_vfm == INTEL_METEORLAKE ||
+ boot_cpu_data.x86_vfm == INTEL_METEORLAKE_L)
return true;
return false;
@@ -4707,17 +4855,22 @@ static inline bool intel_pmu_broken_perf_cap(void)
static void update_pmu_cap(struct x86_hybrid_pmu *pmu)
{
- unsigned int sub_bitmaps = cpuid_eax(ARCH_PERFMON_EXT_LEAF);
- unsigned int eax, ebx, ecx, edx;
+ unsigned int sub_bitmaps, eax, ebx, ecx, edx;
+
+ cpuid(ARCH_PERFMON_EXT_LEAF, &sub_bitmaps, &ebx, &ecx, &edx);
+
+ if (ebx & ARCH_PERFMON_EXT_UMASK2)
+ pmu->config_mask |= ARCH_PERFMON_EVENTSEL_UMASK2;
+ if (ebx & ARCH_PERFMON_EXT_EQ)
+ pmu->config_mask |= ARCH_PERFMON_EVENTSEL_EQ;
if (sub_bitmaps & ARCH_PERFMON_NUM_COUNTER_LEAF_BIT) {
cpuid_count(ARCH_PERFMON_EXT_LEAF, ARCH_PERFMON_NUM_COUNTER_LEAF,
&eax, &ebx, &ecx, &edx);
- pmu->num_counters = fls(eax);
- pmu->num_counters_fixed = fls(ebx);
+ pmu->cntr_mask64 = eax;
+ pmu->fixed_cntr_mask64 = ebx;
}
-
if (!intel_pmu_broken_perf_cap()) {
/* Perf Metric (Bit 15) and PEBS via PT (Bit 16) are hybrid enumeration */
rdmsrl(MSR_IA32_PERF_CAPABILITIES, pmu->intel_cap.capabilities);
@@ -4726,12 +4879,12 @@ static void update_pmu_cap(struct x86_hybrid_pmu *pmu)
static void intel_pmu_check_hybrid_pmus(struct x86_hybrid_pmu *pmu)
{
- intel_pmu_check_num_counters(&pmu->num_counters, &pmu->num_counters_fixed,
- &pmu->intel_ctrl, (1ULL << pmu->num_counters_fixed) - 1);
- pmu->max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, pmu->num_counters);
+ intel_pmu_check_counters_mask(&pmu->cntr_mask64, &pmu->fixed_cntr_mask64,
+ &pmu->intel_ctrl);
+ pmu->pebs_events_mask = intel_pmu_pebs_mask(pmu->cntr_mask64);
pmu->unconstrained = (struct event_constraint)
- __EVENT_CONSTRAINT(0, (1ULL << pmu->num_counters) - 1,
- 0, pmu->num_counters, 0, 0);
+ __EVENT_CONSTRAINT(0, pmu->cntr_mask64,
+ 0, x86_pmu_num_counters(&pmu->pmu), 0, 0);
if (pmu->intel_cap.perf_metrics)
pmu->intel_ctrl |= 1ULL << GLOBAL_CTRL_EN_PERF_METRICS;
@@ -4744,8 +4897,8 @@ static void intel_pmu_check_hybrid_pmus(struct x86_hybrid_pmu *pmu)
pmu->pmu.capabilities &= ~PERF_PMU_CAP_AUX_OUTPUT;
intel_pmu_check_event_constraints(pmu->event_constraints,
- pmu->num_counters,
- pmu->num_counters_fixed,
+ pmu->cntr_mask64,
+ pmu->fixed_cntr_mask64,
pmu->intel_ctrl);
intel_pmu_check_extra_regs(pmu->extra_regs);
@@ -4806,7 +4959,7 @@ static bool init_hybrid_pmu(int cpu)
intel_pmu_check_hybrid_pmus(pmu);
- if (!check_hw_exists(&pmu->pmu, pmu->num_counters, pmu->num_counters_fixed))
+ if (!check_hw_exists(&pmu->pmu, pmu->cntr_mask, pmu->fixed_cntr_mask))
return false;
pr_info("%s PMU driver: ", pmu->name);
@@ -4816,8 +4969,7 @@ static bool init_hybrid_pmu(int cpu)
pr_cont("\n");
- x86_pmu_show_pmu_cap(pmu->num_counters, pmu->num_counters_fixed,
- pmu->intel_ctrl);
+ x86_pmu_show_pmu_cap(&pmu->pmu);
end:
cpumask_set_cpu(cpu, &pmu->supported_cpus);
@@ -5058,6 +5210,7 @@ static __initconst const struct x86_pmu core_pmu = {
.schedule_events = x86_schedule_events,
.eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
.perfctr = MSR_ARCH_PERFMON_PERFCTR0,
+ .fixedctr = MSR_ARCH_PERFMON_FIXED_CTR0,
.event_map = intel_pmu_event_map,
.max_events = ARRAY_SIZE(intel_perfmon_event_map),
.apic = 1,
@@ -5111,6 +5264,7 @@ static __initconst const struct x86_pmu intel_pmu = {
.schedule_events = x86_schedule_events,
.eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
.perfctr = MSR_ARCH_PERFMON_PERFCTR0,
+ .fixedctr = MSR_ARCH_PERFMON_FIXED_CTR0,
.event_map = intel_pmu_event_map,
.max_events = ARRAY_SIZE(intel_perfmon_event_map),
.apic = 1,
@@ -5187,35 +5341,35 @@ static __init void intel_clovertown_quirk(void)
}
static const struct x86_cpu_desc isolation_ucodes[] = {
- INTEL_CPU_DESC(INTEL_FAM6_HASWELL, 3, 0x0000001f),
- INTEL_CPU_DESC(INTEL_FAM6_HASWELL_L, 1, 0x0000001e),
- INTEL_CPU_DESC(INTEL_FAM6_HASWELL_G, 1, 0x00000015),
- INTEL_CPU_DESC(INTEL_FAM6_HASWELL_X, 2, 0x00000037),
- INTEL_CPU_DESC(INTEL_FAM6_HASWELL_X, 4, 0x0000000a),
- INTEL_CPU_DESC(INTEL_FAM6_BROADWELL, 4, 0x00000023),
- INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_G, 1, 0x00000014),
- INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_D, 2, 0x00000010),
- INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_D, 3, 0x07000009),
- INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_D, 4, 0x0f000009),
- INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_D, 5, 0x0e000002),
- INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_X, 1, 0x0b000014),
- INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 3, 0x00000021),
- INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 4, 0x00000000),
- INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 5, 0x00000000),
- INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 6, 0x00000000),
- INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 7, 0x00000000),
- INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 11, 0x00000000),
- INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_L, 3, 0x0000007c),
- INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE, 3, 0x0000007c),
- INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE, 9, 0x0000004e),
- INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE_L, 9, 0x0000004e),
- INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE_L, 10, 0x0000004e),
- INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE_L, 11, 0x0000004e),
- INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE_L, 12, 0x0000004e),
- INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE, 10, 0x0000004e),
- INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE, 11, 0x0000004e),
- INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE, 12, 0x0000004e),
- INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE, 13, 0x0000004e),
+ INTEL_CPU_DESC(INTEL_HASWELL, 3, 0x0000001f),
+ INTEL_CPU_DESC(INTEL_HASWELL_L, 1, 0x0000001e),
+ INTEL_CPU_DESC(INTEL_HASWELL_G, 1, 0x00000015),
+ INTEL_CPU_DESC(INTEL_HASWELL_X, 2, 0x00000037),
+ INTEL_CPU_DESC(INTEL_HASWELL_X, 4, 0x0000000a),
+ INTEL_CPU_DESC(INTEL_BROADWELL, 4, 0x00000023),
+ INTEL_CPU_DESC(INTEL_BROADWELL_G, 1, 0x00000014),
+ INTEL_CPU_DESC(INTEL_BROADWELL_D, 2, 0x00000010),
+ INTEL_CPU_DESC(INTEL_BROADWELL_D, 3, 0x07000009),
+ INTEL_CPU_DESC(INTEL_BROADWELL_D, 4, 0x0f000009),
+ INTEL_CPU_DESC(INTEL_BROADWELL_D, 5, 0x0e000002),
+ INTEL_CPU_DESC(INTEL_BROADWELL_X, 1, 0x0b000014),
+ INTEL_CPU_DESC(INTEL_SKYLAKE_X, 3, 0x00000021),
+ INTEL_CPU_DESC(INTEL_SKYLAKE_X, 4, 0x00000000),
+ INTEL_CPU_DESC(INTEL_SKYLAKE_X, 5, 0x00000000),
+ INTEL_CPU_DESC(INTEL_SKYLAKE_X, 6, 0x00000000),
+ INTEL_CPU_DESC(INTEL_SKYLAKE_X, 7, 0x00000000),
+ INTEL_CPU_DESC(INTEL_SKYLAKE_X, 11, 0x00000000),
+ INTEL_CPU_DESC(INTEL_SKYLAKE_L, 3, 0x0000007c),
+ INTEL_CPU_DESC(INTEL_SKYLAKE, 3, 0x0000007c),
+ INTEL_CPU_DESC(INTEL_KABYLAKE, 9, 0x0000004e),
+ INTEL_CPU_DESC(INTEL_KABYLAKE_L, 9, 0x0000004e),
+ INTEL_CPU_DESC(INTEL_KABYLAKE_L, 10, 0x0000004e),
+ INTEL_CPU_DESC(INTEL_KABYLAKE_L, 11, 0x0000004e),
+ INTEL_CPU_DESC(INTEL_KABYLAKE_L, 12, 0x0000004e),
+ INTEL_CPU_DESC(INTEL_KABYLAKE, 10, 0x0000004e),
+ INTEL_CPU_DESC(INTEL_KABYLAKE, 11, 0x0000004e),
+ INTEL_CPU_DESC(INTEL_KABYLAKE, 12, 0x0000004e),
+ INTEL_CPU_DESC(INTEL_KABYLAKE, 13, 0x0000004e),
{}
};
@@ -5232,9 +5386,9 @@ static __init void intel_pebs_isolation_quirk(void)
}
static const struct x86_cpu_desc pebs_ucodes[] = {
- INTEL_CPU_DESC(INTEL_FAM6_SANDYBRIDGE, 7, 0x00000028),
- INTEL_CPU_DESC(INTEL_FAM6_SANDYBRIDGE_X, 6, 0x00000618),
- INTEL_CPU_DESC(INTEL_FAM6_SANDYBRIDGE_X, 7, 0x0000070c),
+ INTEL_CPU_DESC(INTEL_SANDYBRIDGE, 7, 0x00000028),
+ INTEL_CPU_DESC(INTEL_SANDYBRIDGE_X, 6, 0x00000618),
+ INTEL_CPU_DESC(INTEL_SANDYBRIDGE_X, 7, 0x0000070c),
{}
};
@@ -5645,18 +5799,11 @@ lbr_is_visible(struct kobject *kobj, struct attribute *attr, int i)
static char pmu_name_str[30];
-static ssize_t pmu_name_show(struct device *cdev,
- struct device_attribute *attr,
- char *buf)
-{
- return snprintf(buf, PAGE_SIZE, "%s\n", pmu_name_str);
-}
-
-static DEVICE_ATTR_RO(pmu_name);
+static DEVICE_STRING_ATTR_RO(pmu_name, 0444, pmu_name_str);
static struct attribute *intel_pmu_caps_attrs[] = {
- &dev_attr_pmu_name.attr,
- NULL
+ &dev_attr_pmu_name.attr.attr,
+ NULL
};
static DEVICE_ATTR(allow_tsx_force_abort, 0644,
@@ -5705,8 +5852,22 @@ exra_is_visible(struct kobject *kobj, struct attribute *attr, int i)
return x86_pmu.version >= 2 ? attr->mode : 0;
}
+static umode_t
+td_is_visible(struct kobject *kobj, struct attribute *attr, int i)
+{
+ /*
+ * Hide the perf metrics topdown events
+ * if the feature is not enumerated.
+ */
+ if (x86_pmu.num_topdown_events)
+ return x86_pmu.intel_cap.perf_metrics ? attr->mode : 0;
+
+ return attr->mode;
+}
+
static struct attribute_group group_events_td = {
.name = "events",
+ .is_visible = td_is_visible,
};
static struct attribute_group group_events_mem = {
@@ -5740,6 +5901,12 @@ static struct attribute_group group_format_extra_skl = {
.is_visible = exra_is_visible,
};
+static struct attribute_group group_format_evtsel_ext = {
+ .name = "format",
+ .attrs = format_evtsel_ext_attrs,
+ .is_visible = evtsel_ext_is_visible,
+};
+
static struct attribute_group group_default = {
.attrs = intel_pmu_attrs,
.is_visible = default_is_visible,
@@ -5753,6 +5920,7 @@ static const struct attribute_group *attr_update[] = {
&group_caps_lbr,
&group_format_extra,
&group_format_extra_skl,
+ &group_format_evtsel_ext,
&group_default,
NULL,
};
@@ -5780,6 +5948,23 @@ static struct attribute *adl_hybrid_events_attrs[] = {
NULL,
};
+EVENT_ATTR_STR_HYBRID(topdown-retiring, td_retiring_lnl, "event=0xc2,umask=0x02;event=0x00,umask=0x80", hybrid_big_small);
+EVENT_ATTR_STR_HYBRID(topdown-fe-bound, td_fe_bound_lnl, "event=0x9c,umask=0x01;event=0x00,umask=0x82", hybrid_big_small);
+EVENT_ATTR_STR_HYBRID(topdown-be-bound, td_be_bound_lnl, "event=0xa4,umask=0x02;event=0x00,umask=0x83", hybrid_big_small);
+
+static struct attribute *lnl_hybrid_events_attrs[] = {
+ EVENT_PTR(slots_adl),
+ EVENT_PTR(td_retiring_lnl),
+ EVENT_PTR(td_bad_spec_adl),
+ EVENT_PTR(td_fe_bound_lnl),
+ EVENT_PTR(td_be_bound_lnl),
+ EVENT_PTR(td_heavy_ops_adl),
+ EVENT_PTR(td_br_mis_adl),
+ EVENT_PTR(td_fetch_lat_adl),
+ EVENT_PTR(td_mem_bound_adl),
+ NULL
+};
+
/* Must be in IDX order */
EVENT_ATTR_STR_HYBRID(mem-loads, mem_ld_adl, "event=0xd0,umask=0x5,ldlat=3;event=0xcd,umask=0x1,ldlat=3", hybrid_big_small);
EVENT_ATTR_STR_HYBRID(mem-stores, mem_st_adl, "event=0xd0,umask=0x6;event=0xcd,umask=0x2", hybrid_big_small);
@@ -5908,9 +6093,27 @@ static umode_t hybrid_format_is_visible(struct kobject *kobj,
return (cpu >= 0) && (pmu->pmu_type & pmu_attr->pmu_type) ? attr->mode : 0;
}
+static umode_t hybrid_td_is_visible(struct kobject *kobj,
+ struct attribute *attr, int i)
+{
+ struct device *dev = kobj_to_dev(kobj);
+ struct x86_hybrid_pmu *pmu =
+ container_of(dev_get_drvdata(dev), struct x86_hybrid_pmu, pmu);
+
+ if (!is_attr_for_this_pmu(kobj, attr))
+ return 0;
+
+
+ /* Only the big core supports perf metrics */
+ if (pmu->pmu_type == hybrid_big)
+ return pmu->intel_cap.perf_metrics ? attr->mode : 0;
+
+ return attr->mode;
+}
+
static struct attribute_group hybrid_group_events_td = {
.name = "events",
- .is_visible = hybrid_events_is_visible,
+ .is_visible = hybrid_td_is_visible,
};
static struct attribute_group hybrid_group_events_mem = {
@@ -5955,6 +6158,7 @@ static const struct attribute_group *hybrid_attr_update[] = {
&group_caps_gen,
&group_caps_lbr,
&hybrid_group_format_extra,
+ &group_format_evtsel_ext,
&group_default,
&hybrid_group_cpus,
NULL,
@@ -5962,29 +6166,9 @@ static const struct attribute_group *hybrid_attr_update[] = {
static struct attribute *empty_attrs;
-static void intel_pmu_check_num_counters(int *num_counters,
- int *num_counters_fixed,
- u64 *intel_ctrl, u64 fixed_mask)
-{
- if (*num_counters > INTEL_PMC_MAX_GENERIC) {
- WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
- *num_counters, INTEL_PMC_MAX_GENERIC);
- *num_counters = INTEL_PMC_MAX_GENERIC;
- }
- *intel_ctrl = (1ULL << *num_counters) - 1;
-
- if (*num_counters_fixed > INTEL_PMC_MAX_FIXED) {
- WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
- *num_counters_fixed, INTEL_PMC_MAX_FIXED);
- *num_counters_fixed = INTEL_PMC_MAX_FIXED;
- }
-
- *intel_ctrl |= fixed_mask << INTEL_PMC_IDX_FIXED;
-}
-
static void intel_pmu_check_event_constraints(struct event_constraint *event_constraints,
- int num_counters,
- int num_counters_fixed,
+ u64 cntr_mask,
+ u64 fixed_cntr_mask,
u64 intel_ctrl)
{
struct event_constraint *c;
@@ -6021,10 +6205,9 @@ static void intel_pmu_check_event_constraints(struct event_constraint *event_con
* generic counters
*/
if (!use_fixed_pseudo_encoding(c->code))
- c->idxmsk64 |= (1ULL << num_counters) - 1;
+ c->idxmsk64 |= cntr_mask;
}
- c->idxmsk64 &=
- ~(~0ULL << (INTEL_PMC_IDX_FIXED + num_counters_fixed));
+ c->idxmsk64 &= cntr_mask | (fixed_cntr_mask << INTEL_PMC_IDX_FIXED);
c->weight = hweight64(c->idxmsk64);
}
}
@@ -6049,6 +6232,11 @@ static void intel_pmu_check_extra_regs(struct extra_reg *extra_regs)
}
}
+static inline int intel_pmu_v6_addr_offset(int index, bool eventsel)
+{
+ return MSR_IA32_PMC_V6_STEP * index;
+}
+
static const struct { enum hybrid_pmu_type id; char *name; } intel_hybrid_pmu_type_map[] __initconst = {
{ hybrid_small, "cpu_atom" },
{ hybrid_big, "cpu_core" },
@@ -6075,12 +6263,13 @@ static __always_inline int intel_pmu_init_hybrid(enum hybrid_pmu_type pmus)
pmu->pmu_type = intel_hybrid_pmu_type_map[bit].id;
pmu->name = intel_hybrid_pmu_type_map[bit].name;
- pmu->num_counters = x86_pmu.num_counters;
- pmu->num_counters_fixed = x86_pmu.num_counters_fixed;
- pmu->max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, pmu->num_counters);
+ pmu->cntr_mask64 = x86_pmu.cntr_mask64;
+ pmu->fixed_cntr_mask64 = x86_pmu.fixed_cntr_mask64;
+ pmu->pebs_events_mask = intel_pmu_pebs_mask(pmu->cntr_mask64);
+ pmu->config_mask = X86_RAW_EVENT_MASK;
pmu->unconstrained = (struct event_constraint)
- __EVENT_CONSTRAINT(0, (1ULL << pmu->num_counters) - 1,
- 0, pmu->num_counters, 0, 0);
+ __EVENT_CONSTRAINT(0, pmu->cntr_mask64,
+ 0, x86_pmu_num_counters(&pmu->pmu), 0, 0);
pmu->intel_cap.capabilities = x86_pmu.intel_cap.capabilities;
if (pmu->pmu_type & hybrid_small) {
@@ -6150,6 +6339,21 @@ static __always_inline void intel_pmu_init_grt(struct pmu *pmu)
intel_pmu_ref_cycles_ext();
}
+static __always_inline void intel_pmu_init_lnc(struct pmu *pmu)
+{
+ intel_pmu_init_glc(pmu);
+ hybrid(pmu, event_constraints) = intel_lnc_event_constraints;
+ hybrid(pmu, pebs_constraints) = intel_lnc_pebs_event_constraints;
+ hybrid(pmu, extra_regs) = intel_rwc_extra_regs;
+}
+
+static __always_inline void intel_pmu_init_skt(struct pmu *pmu)
+{
+ intel_pmu_init_grt(pmu);
+ hybrid(pmu, event_constraints) = intel_skt_event_constraints;
+ hybrid(pmu, extra_regs) = intel_cmt_extra_regs;
+}
+
__init int intel_pmu_init(void)
{
struct attribute **extra_skl_attr = &empty_attrs;
@@ -6193,14 +6397,14 @@ __init int intel_pmu_init(void)
x86_pmu = intel_pmu;
x86_pmu.version = version;
- x86_pmu.num_counters = eax.split.num_counters;
+ x86_pmu.cntr_mask64 = GENMASK_ULL(eax.split.num_counters - 1, 0);
x86_pmu.cntval_bits = eax.split.bit_width;
x86_pmu.cntval_mask = (1ULL << eax.split.bit_width) - 1;
x86_pmu.events_maskl = ebx.full;
x86_pmu.events_mask_len = eax.split.mask_length;
- x86_pmu.max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, x86_pmu.num_counters);
+ x86_pmu.pebs_events_mask = intel_pmu_pebs_mask(x86_pmu.cntr_mask64);
x86_pmu.pebs_capable = PEBS_COUNTER_MASK;
/*
@@ -6210,12 +6414,10 @@ __init int intel_pmu_init(void)
if (version > 1 && version < 5) {
int assume = 3 * !boot_cpu_has(X86_FEATURE_HYPERVISOR);
- x86_pmu.num_counters_fixed =
- max((int)edx.split.num_counters_fixed, assume);
-
- fixed_mask = (1L << x86_pmu.num_counters_fixed) - 1;
+ x86_pmu.fixed_cntr_mask64 =
+ GENMASK_ULL(max((int)edx.split.num_counters_fixed, assume) - 1, 0);
} else if (version >= 5)
- x86_pmu.num_counters_fixed = fls(fixed_mask);
+ x86_pmu.fixed_cntr_mask64 = fixed_mask;
if (boot_cpu_has(X86_FEATURE_PDCM)) {
u64 capabilities;
@@ -6245,19 +6447,19 @@ __init int intel_pmu_init(void)
/*
* Install the hw-cache-events table:
*/
- switch (boot_cpu_data.x86_model) {
- case INTEL_FAM6_CORE_YONAH:
+ switch (boot_cpu_data.x86_vfm) {
+ case INTEL_CORE_YONAH:
pr_cont("Core events, ");
name = "core";
break;
- case INTEL_FAM6_CORE2_MEROM:
+ case INTEL_CORE2_MEROM:
x86_add_quirk(intel_clovertown_quirk);
fallthrough;
- case INTEL_FAM6_CORE2_MEROM_L:
- case INTEL_FAM6_CORE2_PENRYN:
- case INTEL_FAM6_CORE2_DUNNINGTON:
+ case INTEL_CORE2_MEROM_L:
+ case INTEL_CORE2_PENRYN:
+ case INTEL_CORE2_DUNNINGTON:
memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
@@ -6269,9 +6471,9 @@ __init int intel_pmu_init(void)
name = "core2";
break;
- case INTEL_FAM6_NEHALEM:
- case INTEL_FAM6_NEHALEM_EP:
- case INTEL_FAM6_NEHALEM_EX:
+ case INTEL_NEHALEM:
+ case INTEL_NEHALEM_EP:
+ case INTEL_NEHALEM_EX:
memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs,
@@ -6303,11 +6505,11 @@ __init int intel_pmu_init(void)
name = "nehalem";
break;
- case INTEL_FAM6_ATOM_BONNELL:
- case INTEL_FAM6_ATOM_BONNELL_MID:
- case INTEL_FAM6_ATOM_SALTWELL:
- case INTEL_FAM6_ATOM_SALTWELL_MID:
- case INTEL_FAM6_ATOM_SALTWELL_TABLET:
+ case INTEL_ATOM_BONNELL:
+ case INTEL_ATOM_BONNELL_MID:
+ case INTEL_ATOM_SALTWELL:
+ case INTEL_ATOM_SALTWELL_MID:
+ case INTEL_ATOM_SALTWELL_TABLET:
memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
@@ -6320,11 +6522,11 @@ __init int intel_pmu_init(void)
name = "bonnell";
break;
- case INTEL_FAM6_ATOM_SILVERMONT:
- case INTEL_FAM6_ATOM_SILVERMONT_D:
- case INTEL_FAM6_ATOM_SILVERMONT_MID:
- case INTEL_FAM6_ATOM_AIRMONT:
- case INTEL_FAM6_ATOM_AIRMONT_MID:
+ case INTEL_ATOM_SILVERMONT:
+ case INTEL_ATOM_SILVERMONT_D:
+ case INTEL_ATOM_SILVERMONT_MID:
+ case INTEL_ATOM_AIRMONT:
+ case INTEL_ATOM_AIRMONT_MID:
memcpy(hw_cache_event_ids, slm_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
memcpy(hw_cache_extra_regs, slm_hw_cache_extra_regs,
@@ -6342,8 +6544,8 @@ __init int intel_pmu_init(void)
name = "silvermont";
break;
- case INTEL_FAM6_ATOM_GOLDMONT:
- case INTEL_FAM6_ATOM_GOLDMONT_D:
+ case INTEL_ATOM_GOLDMONT:
+ case INTEL_ATOM_GOLDMONT_D:
memcpy(hw_cache_event_ids, glm_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
memcpy(hw_cache_extra_regs, glm_hw_cache_extra_regs,
@@ -6369,7 +6571,7 @@ __init int intel_pmu_init(void)
name = "goldmont";
break;
- case INTEL_FAM6_ATOM_GOLDMONT_PLUS:
+ case INTEL_ATOM_GOLDMONT_PLUS:
memcpy(hw_cache_event_ids, glp_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
memcpy(hw_cache_extra_regs, glp_hw_cache_extra_regs,
@@ -6398,9 +6600,9 @@ __init int intel_pmu_init(void)
name = "goldmont_plus";
break;
- case INTEL_FAM6_ATOM_TREMONT_D:
- case INTEL_FAM6_ATOM_TREMONT:
- case INTEL_FAM6_ATOM_TREMONT_L:
+ case INTEL_ATOM_TREMONT_D:
+ case INTEL_ATOM_TREMONT:
+ case INTEL_ATOM_TREMONT_L:
x86_pmu.late_ack = true;
memcpy(hw_cache_event_ids, glp_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
@@ -6427,10 +6629,10 @@ __init int intel_pmu_init(void)
name = "Tremont";
break;
- case INTEL_FAM6_ATOM_GRACEMONT:
+ case INTEL_ATOM_GRACEMONT:
intel_pmu_init_grt(NULL);
intel_pmu_pebs_data_source_grt();
- x86_pmu.pebs_latency_data = adl_latency_data_small;
+ x86_pmu.pebs_latency_data = grt_latency_data;
x86_pmu.get_event_constraints = tnt_get_event_constraints;
td_attr = tnt_events_attrs;
mem_attr = grt_mem_attrs;
@@ -6439,12 +6641,12 @@ __init int intel_pmu_init(void)
name = "gracemont";
break;
- case INTEL_FAM6_ATOM_CRESTMONT:
- case INTEL_FAM6_ATOM_CRESTMONT_X:
+ case INTEL_ATOM_CRESTMONT:
+ case INTEL_ATOM_CRESTMONT_X:
intel_pmu_init_grt(NULL);
x86_pmu.extra_regs = intel_cmt_extra_regs;
intel_pmu_pebs_data_source_cmt();
- x86_pmu.pebs_latency_data = mtl_latency_data_small;
+ x86_pmu.pebs_latency_data = cmt_latency_data;
x86_pmu.get_event_constraints = cmt_get_event_constraints;
td_attr = cmt_events_attrs;
mem_attr = grt_mem_attrs;
@@ -6453,9 +6655,9 @@ __init int intel_pmu_init(void)
name = "crestmont";
break;
- case INTEL_FAM6_WESTMERE:
- case INTEL_FAM6_WESTMERE_EP:
- case INTEL_FAM6_WESTMERE_EX:
+ case INTEL_WESTMERE:
+ case INTEL_WESTMERE_EP:
+ case INTEL_WESTMERE_EX:
memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs,
@@ -6484,8 +6686,8 @@ __init int intel_pmu_init(void)
name = "westmere";
break;
- case INTEL_FAM6_SANDYBRIDGE:
- case INTEL_FAM6_SANDYBRIDGE_X:
+ case INTEL_SANDYBRIDGE:
+ case INTEL_SANDYBRIDGE_X:
x86_add_quirk(intel_sandybridge_quirk);
x86_add_quirk(intel_ht_bug);
memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
@@ -6498,7 +6700,7 @@ __init int intel_pmu_init(void)
x86_pmu.event_constraints = intel_snb_event_constraints;
x86_pmu.pebs_constraints = intel_snb_pebs_event_constraints;
x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
- if (boot_cpu_data.x86_model == INTEL_FAM6_SANDYBRIDGE_X)
+ if (boot_cpu_data.x86_vfm == INTEL_SANDYBRIDGE_X)
x86_pmu.extra_regs = intel_snbep_extra_regs;
else
x86_pmu.extra_regs = intel_snb_extra_regs;
@@ -6524,8 +6726,8 @@ __init int intel_pmu_init(void)
name = "sandybridge";
break;
- case INTEL_FAM6_IVYBRIDGE:
- case INTEL_FAM6_IVYBRIDGE_X:
+ case INTEL_IVYBRIDGE:
+ case INTEL_IVYBRIDGE_X:
x86_add_quirk(intel_ht_bug);
memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
@@ -6541,7 +6743,7 @@ __init int intel_pmu_init(void)
x86_pmu.pebs_constraints = intel_ivb_pebs_event_constraints;
x86_pmu.pebs_aliases = intel_pebs_aliases_ivb;
x86_pmu.pebs_prec_dist = true;
- if (boot_cpu_data.x86_model == INTEL_FAM6_IVYBRIDGE_X)
+ if (boot_cpu_data.x86_vfm == INTEL_IVYBRIDGE_X)
x86_pmu.extra_regs = intel_snbep_extra_regs;
else
x86_pmu.extra_regs = intel_snb_extra_regs;
@@ -6563,10 +6765,10 @@ __init int intel_pmu_init(void)
break;
- case INTEL_FAM6_HASWELL:
- case INTEL_FAM6_HASWELL_X:
- case INTEL_FAM6_HASWELL_L:
- case INTEL_FAM6_HASWELL_G:
+ case INTEL_HASWELL:
+ case INTEL_HASWELL_X:
+ case INTEL_HASWELL_L:
+ case INTEL_HASWELL_G:
x86_add_quirk(intel_ht_bug);
x86_add_quirk(intel_pebs_isolation_quirk);
x86_pmu.late_ack = true;
@@ -6586,6 +6788,7 @@ __init int intel_pmu_init(void)
x86_pmu.hw_config = hsw_hw_config;
x86_pmu.get_event_constraints = hsw_get_event_constraints;
+ x86_pmu.limit_period = hsw_limit_period;
x86_pmu.lbr_double_abort = true;
extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
hsw_format_attr : nhm_format_attr;
@@ -6596,10 +6799,10 @@ __init int intel_pmu_init(void)
name = "haswell";
break;
- case INTEL_FAM6_BROADWELL:
- case INTEL_FAM6_BROADWELL_D:
- case INTEL_FAM6_BROADWELL_G:
- case INTEL_FAM6_BROADWELL_X:
+ case INTEL_BROADWELL:
+ case INTEL_BROADWELL_D:
+ case INTEL_BROADWELL_G:
+ case INTEL_BROADWELL_X:
x86_add_quirk(intel_pebs_isolation_quirk);
x86_pmu.late_ack = true;
memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
@@ -6638,8 +6841,8 @@ __init int intel_pmu_init(void)
name = "broadwell";
break;
- case INTEL_FAM6_XEON_PHI_KNL:
- case INTEL_FAM6_XEON_PHI_KNM:
+ case INTEL_XEON_PHI_KNL:
+ case INTEL_XEON_PHI_KNM:
memcpy(hw_cache_event_ids,
slm_hw_cache_event_ids, sizeof(hw_cache_event_ids));
memcpy(hw_cache_extra_regs,
@@ -6658,15 +6861,15 @@ __init int intel_pmu_init(void)
name = "knights-landing";
break;
- case INTEL_FAM6_SKYLAKE_X:
+ case INTEL_SKYLAKE_X:
pmem = true;
fallthrough;
- case INTEL_FAM6_SKYLAKE_L:
- case INTEL_FAM6_SKYLAKE:
- case INTEL_FAM6_KABYLAKE_L:
- case INTEL_FAM6_KABYLAKE:
- case INTEL_FAM6_COMETLAKE_L:
- case INTEL_FAM6_COMETLAKE:
+ case INTEL_SKYLAKE_L:
+ case INTEL_SKYLAKE:
+ case INTEL_KABYLAKE_L:
+ case INTEL_KABYLAKE:
+ case INTEL_COMETLAKE_L:
+ case INTEL_COMETLAKE:
x86_add_quirk(intel_pebs_isolation_quirk);
x86_pmu.late_ack = true;
memcpy(hw_cache_event_ids, skl_hw_cache_event_ids, sizeof(hw_cache_event_ids));
@@ -6715,16 +6918,16 @@ __init int intel_pmu_init(void)
name = "skylake";
break;
- case INTEL_FAM6_ICELAKE_X:
- case INTEL_FAM6_ICELAKE_D:
+ case INTEL_ICELAKE_X:
+ case INTEL_ICELAKE_D:
x86_pmu.pebs_ept = 1;
pmem = true;
fallthrough;
- case INTEL_FAM6_ICELAKE_L:
- case INTEL_FAM6_ICELAKE:
- case INTEL_FAM6_TIGERLAKE_L:
- case INTEL_FAM6_TIGERLAKE:
- case INTEL_FAM6_ROCKETLAKE:
+ case INTEL_ICELAKE_L:
+ case INTEL_ICELAKE:
+ case INTEL_TIGERLAKE_L:
+ case INTEL_TIGERLAKE:
+ case INTEL_ROCKETLAKE:
x86_pmu.late_ack = true;
memcpy(hw_cache_event_ids, skl_hw_cache_event_ids, sizeof(hw_cache_event_ids));
memcpy(hw_cache_extra_regs, skl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
@@ -6759,16 +6962,22 @@ __init int intel_pmu_init(void)
name = "icelake";
break;
- case INTEL_FAM6_SAPPHIRERAPIDS_X:
- case INTEL_FAM6_EMERALDRAPIDS_X:
+ case INTEL_SAPPHIRERAPIDS_X:
+ case INTEL_EMERALDRAPIDS_X:
x86_pmu.flags |= PMU_FL_MEM_LOADS_AUX;
x86_pmu.extra_regs = intel_glc_extra_regs;
- fallthrough;
- case INTEL_FAM6_GRANITERAPIDS_X:
- case INTEL_FAM6_GRANITERAPIDS_D:
+ pr_cont("Sapphire Rapids events, ");
+ name = "sapphire_rapids";
+ goto glc_common;
+
+ case INTEL_GRANITERAPIDS_X:
+ case INTEL_GRANITERAPIDS_D:
+ x86_pmu.extra_regs = intel_rwc_extra_regs;
+ pr_cont("Granite Rapids events, ");
+ name = "granite_rapids";
+
+ glc_common:
intel_pmu_init_glc(NULL);
- if (!x86_pmu.extra_regs)
- x86_pmu.extra_regs = intel_rwc_extra_regs;
x86_pmu.pebs_ept = 1;
x86_pmu.hw_config = hsw_hw_config;
x86_pmu.get_event_constraints = glc_get_event_constraints;
@@ -6779,15 +6988,13 @@ __init int intel_pmu_init(void)
td_attr = glc_td_events_attrs;
tsx_attr = glc_tsx_events_attrs;
intel_pmu_pebs_data_source_skl(true);
- pr_cont("Sapphire Rapids events, ");
- name = "sapphire_rapids";
break;
- case INTEL_FAM6_ALDERLAKE:
- case INTEL_FAM6_ALDERLAKE_L:
- case INTEL_FAM6_RAPTORLAKE:
- case INTEL_FAM6_RAPTORLAKE_P:
- case INTEL_FAM6_RAPTORLAKE_S:
+ case INTEL_ALDERLAKE:
+ case INTEL_ALDERLAKE_L:
+ case INTEL_RAPTORLAKE:
+ case INTEL_RAPTORLAKE_P:
+ case INTEL_RAPTORLAKE_S:
/*
* Alder Lake has 2 types of CPU, core and atom.
*
@@ -6795,7 +7002,7 @@ __init int intel_pmu_init(void)
*/
intel_pmu_init_hybrid(hybrid_big_small);
- x86_pmu.pebs_latency_data = adl_latency_data_small;
+ x86_pmu.pebs_latency_data = grt_latency_data;
x86_pmu.get_event_constraints = adl_get_event_constraints;
x86_pmu.hw_config = adl_hw_config;
x86_pmu.get_hybrid_cpu_type = adl_get_hybrid_cpu_type;
@@ -6810,11 +7017,13 @@ __init int intel_pmu_init(void)
pmu = &x86_pmu.hybrid_pmu[X86_HYBRID_PMU_CORE_IDX];
intel_pmu_init_glc(&pmu->pmu);
if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) {
- pmu->num_counters = x86_pmu.num_counters + 2;
- pmu->num_counters_fixed = x86_pmu.num_counters_fixed + 1;
+ pmu->cntr_mask64 <<= 2;
+ pmu->cntr_mask64 |= 0x3;
+ pmu->fixed_cntr_mask64 <<= 1;
+ pmu->fixed_cntr_mask64 |= 0x1;
} else {
- pmu->num_counters = x86_pmu.num_counters;
- pmu->num_counters_fixed = x86_pmu.num_counters_fixed;
+ pmu->cntr_mask64 = x86_pmu.cntr_mask64;
+ pmu->fixed_cntr_mask64 = x86_pmu.fixed_cntr_mask64;
}
/*
@@ -6824,15 +7033,16 @@ __init int intel_pmu_init(void)
* mistakenly add extra counters for P-cores. Correct the number of
* counters here.
*/
- if ((pmu->num_counters > 8) || (pmu->num_counters_fixed > 4)) {
- pmu->num_counters = x86_pmu.num_counters;
- pmu->num_counters_fixed = x86_pmu.num_counters_fixed;
+ if ((x86_pmu_num_counters(&pmu->pmu) > 8) || (x86_pmu_num_counters_fixed(&pmu->pmu) > 4)) {
+ pmu->cntr_mask64 = x86_pmu.cntr_mask64;
+ pmu->fixed_cntr_mask64 = x86_pmu.fixed_cntr_mask64;
}
- pmu->max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, pmu->num_counters);
+ pmu->pebs_events_mask = intel_pmu_pebs_mask(pmu->cntr_mask64);
pmu->unconstrained = (struct event_constraint)
- __EVENT_CONSTRAINT(0, (1ULL << pmu->num_counters) - 1,
- 0, pmu->num_counters, 0, 0);
+ __EVENT_CONSTRAINT(0, pmu->cntr_mask64,
+ 0, x86_pmu_num_counters(&pmu->pmu), 0, 0);
+
pmu->extra_regs = intel_glc_extra_regs;
/* Initialize Atom core specific PerfMon capabilities.*/
@@ -6845,11 +7055,11 @@ __init int intel_pmu_init(void)
name = "alderlake_hybrid";
break;
- case INTEL_FAM6_METEORLAKE:
- case INTEL_FAM6_METEORLAKE_L:
+ case INTEL_METEORLAKE:
+ case INTEL_METEORLAKE_L:
intel_pmu_init_hybrid(hybrid_big_small);
- x86_pmu.pebs_latency_data = mtl_latency_data_small;
+ x86_pmu.pebs_latency_data = cmt_latency_data;
x86_pmu.get_event_constraints = mtl_get_event_constraints;
x86_pmu.hw_config = adl_hw_config;
@@ -6874,6 +7084,33 @@ __init int intel_pmu_init(void)
name = "meteorlake_hybrid";
break;
+ case INTEL_LUNARLAKE_M:
+ case INTEL_ARROWLAKE:
+ intel_pmu_init_hybrid(hybrid_big_small);
+
+ x86_pmu.pebs_latency_data = lnl_latency_data;
+ x86_pmu.get_event_constraints = mtl_get_event_constraints;
+ x86_pmu.hw_config = adl_hw_config;
+
+ td_attr = lnl_hybrid_events_attrs;
+ mem_attr = mtl_hybrid_mem_attrs;
+ tsx_attr = adl_hybrid_tsx_attrs;
+ extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
+ mtl_hybrid_extra_attr_rtm : mtl_hybrid_extra_attr;
+
+ /* Initialize big core specific PerfMon capabilities.*/
+ pmu = &x86_pmu.hybrid_pmu[X86_HYBRID_PMU_CORE_IDX];
+ intel_pmu_init_lnc(&pmu->pmu);
+
+ /* Initialize Atom core specific PerfMon capabilities.*/
+ pmu = &x86_pmu.hybrid_pmu[X86_HYBRID_PMU_ATOM_IDX];
+ intel_pmu_init_skt(&pmu->pmu);
+
+ intel_pmu_pebs_data_source_lnl();
+ pr_cont("Lunarlake Hybrid events, ");
+ name = "lunarlake_hybrid";
+ break;
+
default:
switch (x86_pmu.version) {
case 1:
@@ -6899,9 +7136,9 @@ __init int intel_pmu_init(void)
* The constraints may be cut according to the CPUID enumeration
* by inserting the EVENT_CONSTRAINT_END.
*/
- if (x86_pmu.num_counters_fixed > INTEL_PMC_MAX_FIXED)
- x86_pmu.num_counters_fixed = INTEL_PMC_MAX_FIXED;
- intel_v5_gen_event_constraints[x86_pmu.num_counters_fixed].weight = -1;
+ if (fls64(x86_pmu.fixed_cntr_mask64) > INTEL_PMC_MAX_FIXED)
+ x86_pmu.fixed_cntr_mask64 &= GENMASK_ULL(INTEL_PMC_MAX_FIXED - 1, 0);
+ intel_v5_gen_event_constraints[fls64(x86_pmu.fixed_cntr_mask64)].weight = -1;
x86_pmu.event_constraints = intel_v5_gen_event_constraints;
pr_cont("generic architected perfmon, ");
name = "generic_arch_v5+";
@@ -6928,18 +7165,17 @@ __init int intel_pmu_init(void)
x86_pmu.attr_update = hybrid_attr_update;
}
- intel_pmu_check_num_counters(&x86_pmu.num_counters,
- &x86_pmu.num_counters_fixed,
- &x86_pmu.intel_ctrl,
- (u64)fixed_mask);
+ intel_pmu_check_counters_mask(&x86_pmu.cntr_mask64,
+ &x86_pmu.fixed_cntr_mask64,
+ &x86_pmu.intel_ctrl);
/* AnyThread may be deprecated on arch perfmon v5 or later */
if (x86_pmu.intel_cap.anythread_deprecated)
x86_pmu.format_attrs = intel_arch_formats_attr;
intel_pmu_check_event_constraints(x86_pmu.event_constraints,
- x86_pmu.num_counters,
- x86_pmu.num_counters_fixed,
+ x86_pmu.cntr_mask64,
+ x86_pmu.fixed_cntr_mask64,
x86_pmu.intel_ctrl);
/*
* Access LBR MSR may cause #GP under certain circumstances.
@@ -6980,6 +7216,14 @@ __init int intel_pmu_init(void)
pr_cont("full-width counters, ");
}
+ /* Support V6+ MSR Aliasing */
+ if (x86_pmu.version >= 6) {
+ x86_pmu.perfctr = MSR_IA32_PMC_V6_GP0_CTR;
+ x86_pmu.eventsel = MSR_IA32_PMC_V6_GP0_CFG_A;
+ x86_pmu.fixedctr = MSR_IA32_PMC_V6_FX0_CTR;
+ x86_pmu.addr_offset = intel_pmu_v6_addr_offset;
+ }
+
if (!is_hybrid() && x86_pmu.intel_cap.perf_metrics)
x86_pmu.intel_ctrl |= 1ULL << GLOBAL_CTRL_EN_PERF_METRICS;
diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c
index 326c8cd5aa2d..ae4ec16156bb 100644
--- a/arch/x86/events/intel/cstate.c
+++ b/arch/x86/events/intel/cstate.c
@@ -41,7 +41,7 @@
* MSR_CORE_C1_RES: CORE C1 Residency Counter
* perf code: 0x00
* Available model: SLM,AMT,GLM,CNL,ICX,TNT,ADL,RPL
- * MTL,SRF,GRR
+ * MTL,SRF,GRR,ARL,LNL
* Scope: Core (each processor core has a MSR)
* MSR_CORE_C3_RESIDENCY: CORE C3 Residency Counter
* perf code: 0x01
@@ -53,50 +53,50 @@
* Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,
* SKL,KNL,GLM,CNL,KBL,CML,ICL,ICX,
* TGL,TNT,RKL,ADL,RPL,SPR,MTL,SRF,
- * GRR
+ * GRR,ARL,LNL
* Scope: Core
* MSR_CORE_C7_RESIDENCY: CORE C7 Residency Counter
* perf code: 0x03
* Available model: SNB,IVB,HSW,BDW,SKL,CNL,KBL,CML,
- * ICL,TGL,RKL,ADL,RPL,MTL
+ * ICL,TGL,RKL,ADL,RPL,MTL,ARL,LNL
* Scope: Core
* MSR_PKG_C2_RESIDENCY: Package C2 Residency Counter.
* perf code: 0x00
* Available model: SNB,IVB,HSW,BDW,SKL,KNL,GLM,CNL,
* KBL,CML,ICL,ICX,TGL,TNT,RKL,ADL,
- * RPL,SPR,MTL
+ * RPL,SPR,MTL,ARL,LNL,SRF
* Scope: Package (physical package)
* MSR_PKG_C3_RESIDENCY: Package C3 Residency Counter.
* perf code: 0x01
* Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,KNL,
* GLM,CNL,KBL,CML,ICL,TGL,TNT,RKL,
- * ADL,RPL,MTL
+ * ADL,RPL,MTL,ARL,LNL
* Scope: Package (physical package)
* MSR_PKG_C6_RESIDENCY: Package C6 Residency Counter.
* perf code: 0x02
* Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,
* SKL,KNL,GLM,CNL,KBL,CML,ICL,ICX,
- * TGL,TNT,RKL,ADL,RPL,SPR,MTL,SRF
+ * TGL,TNT,RKL,ADL,RPL,SPR,MTL,SRF,
+ * ARL,LNL
* Scope: Package (physical package)
* MSR_PKG_C7_RESIDENCY: Package C7 Residency Counter.
* perf code: 0x03
* Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,CNL,
- * KBL,CML,ICL,TGL,RKL,ADL,RPL,MTL
+ * KBL,CML,ICL,TGL,RKL
* Scope: Package (physical package)
* MSR_PKG_C8_RESIDENCY: Package C8 Residency Counter.
* perf code: 0x04
* Available model: HSW ULT,KBL,CNL,CML,ICL,TGL,RKL,
- * ADL,RPL,MTL
+ * ADL,RPL,MTL,ARL
* Scope: Package (physical package)
* MSR_PKG_C9_RESIDENCY: Package C9 Residency Counter.
* perf code: 0x05
- * Available model: HSW ULT,KBL,CNL,CML,ICL,TGL,RKL,
- * ADL,RPL,MTL
+ * Available model: HSW ULT,KBL,CNL,CML,ICL,TGL,RKL
* Scope: Package (physical package)
* MSR_PKG_C10_RESIDENCY: Package C10 Residency Counter.
* perf code: 0x06
* Available model: HSW ULT,KBL,GLM,CNL,CML,ICL,TGL,
- * TNT,RKL,ADL,RPL,MTL
+ * TNT,RKL,ADL,RPL,MTL,ARL,LNL
* Scope: Package (physical package)
* MSR_MODULE_C6_RES_MS: Module C6 Residency Counter.
* perf code: 0x00
@@ -114,6 +114,7 @@
#include "../perf_event.h"
#include "../probe.h"
+MODULE_DESCRIPTION("Support for Intel cstate performance events");
MODULE_LICENSE("GPL");
#define DEFINE_CSTATE_FORMAT_ATTR(_var, _name, _format) \
@@ -127,10 +128,6 @@ static ssize_t __cstate_##_var##_show(struct device *dev, \
static struct device_attribute format_attr_##_var = \
__ATTR(_name, 0444, __cstate_##_var##_show, NULL)
-static ssize_t cstate_get_attr_cpumask(struct device *dev,
- struct device_attribute *attr,
- char *buf);
-
/* Model -> events mapping */
struct cstate_model {
unsigned long core_events;
@@ -143,12 +140,6 @@ struct cstate_model {
#define SLM_PKG_C6_USE_C7_MSR (1UL << 0)
#define KNL_CORE_C6_MSR (1UL << 1)
-struct perf_cstate_msr {
- u64 msr;
- struct perf_pmu_events_attr *attr;
-};
-
-
/* cstate_core PMU */
static struct pmu cstate_core_pmu;
static bool has_cstate_core;
@@ -211,22 +202,9 @@ static struct attribute_group cstate_format_attr_group = {
.attrs = cstate_format_attrs,
};
-static cpumask_t cstate_core_cpu_mask;
-static DEVICE_ATTR(cpumask, S_IRUGO, cstate_get_attr_cpumask, NULL);
-
-static struct attribute *cstate_cpumask_attrs[] = {
- &dev_attr_cpumask.attr,
- NULL,
-};
-
-static struct attribute_group cpumask_attr_group = {
- .attrs = cstate_cpumask_attrs,
-};
-
static const struct attribute_group *cstate_attr_groups[] = {
&cstate_events_attr_group,
&cstate_format_attr_group,
- &cpumask_attr_group,
NULL,
};
@@ -274,8 +252,6 @@ static struct perf_msr pkg_msr[] = {
[PERF_CSTATE_PKG_C10_RES] = { MSR_PKG_C10_RESIDENCY, &group_cstate_pkg_c10, test_msr },
};
-static cpumask_t cstate_pkg_cpu_mask;
-
/* cstate_module PMU */
static struct pmu cstate_module_pmu;
static bool has_cstate_module;
@@ -296,28 +272,9 @@ static struct perf_msr module_msr[] = {
[PERF_CSTATE_MODULE_C6_RES] = { MSR_MODULE_C6_RES_MS, &group_cstate_module_c6, test_msr },
};
-static cpumask_t cstate_module_cpu_mask;
-
-static ssize_t cstate_get_attr_cpumask(struct device *dev,
- struct device_attribute *attr,
- char *buf)
-{
- struct pmu *pmu = dev_get_drvdata(dev);
-
- if (pmu == &cstate_core_pmu)
- return cpumap_print_to_pagebuf(true, buf, &cstate_core_cpu_mask);
- else if (pmu == &cstate_pkg_pmu)
- return cpumap_print_to_pagebuf(true, buf, &cstate_pkg_cpu_mask);
- else if (pmu == &cstate_module_pmu)
- return cpumap_print_to_pagebuf(true, buf, &cstate_module_cpu_mask);
- else
- return 0;
-}
-
static int cstate_pmu_event_init(struct perf_event *event)
{
u64 cfg = event->attr.config;
- int cpu;
if (event->attr.type != event->pmu->type)
return -ENOENT;
@@ -336,20 +293,13 @@ static int cstate_pmu_event_init(struct perf_event *event)
if (!(core_msr_mask & (1 << cfg)))
return -EINVAL;
event->hw.event_base = core_msr[cfg].msr;
- cpu = cpumask_any_and(&cstate_core_cpu_mask,
- topology_sibling_cpumask(event->cpu));
} else if (event->pmu == &cstate_pkg_pmu) {
if (cfg >= PERF_CSTATE_PKG_EVENT_MAX)
return -EINVAL;
cfg = array_index_nospec((unsigned long)cfg, PERF_CSTATE_PKG_EVENT_MAX);
if (!(pkg_msr_mask & (1 << cfg)))
return -EINVAL;
-
- event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG;
-
event->hw.event_base = pkg_msr[cfg].msr;
- cpu = cpumask_any_and(&cstate_pkg_cpu_mask,
- topology_die_cpumask(event->cpu));
} else if (event->pmu == &cstate_module_pmu) {
if (cfg >= PERF_CSTATE_MODULE_EVENT_MAX)
return -EINVAL;
@@ -357,16 +307,10 @@ static int cstate_pmu_event_init(struct perf_event *event)
if (!(module_msr_mask & (1 << cfg)))
return -EINVAL;
event->hw.event_base = module_msr[cfg].msr;
- cpu = cpumask_any_and(&cstate_module_cpu_mask,
- topology_cluster_cpumask(event->cpu));
} else {
return -ENOENT;
}
- if (cpu >= nr_cpu_ids)
- return -ENODEV;
-
- event->cpu = cpu;
event->hw.config = cfg;
event->hw.idx = -1;
return 0;
@@ -417,84 +361,6 @@ static int cstate_pmu_event_add(struct perf_event *event, int mode)
return 0;
}
-/*
- * Check if exiting cpu is the designated reader. If so migrate the
- * events when there is a valid target available
- */
-static int cstate_cpu_exit(unsigned int cpu)
-{
- unsigned int target;
-
- if (has_cstate_core &&
- cpumask_test_and_clear_cpu(cpu, &cstate_core_cpu_mask)) {
-
- target = cpumask_any_but(topology_sibling_cpumask(cpu), cpu);
- /* Migrate events if there is a valid target */
- if (target < nr_cpu_ids) {
- cpumask_set_cpu(target, &cstate_core_cpu_mask);
- perf_pmu_migrate_context(&cstate_core_pmu, cpu, target);
- }
- }
-
- if (has_cstate_pkg &&
- cpumask_test_and_clear_cpu(cpu, &cstate_pkg_cpu_mask)) {
-
- target = cpumask_any_but(topology_die_cpumask(cpu), cpu);
- /* Migrate events if there is a valid target */
- if (target < nr_cpu_ids) {
- cpumask_set_cpu(target, &cstate_pkg_cpu_mask);
- perf_pmu_migrate_context(&cstate_pkg_pmu, cpu, target);
- }
- }
-
- if (has_cstate_module &&
- cpumask_test_and_clear_cpu(cpu, &cstate_module_cpu_mask)) {
-
- target = cpumask_any_but(topology_cluster_cpumask(cpu), cpu);
- /* Migrate events if there is a valid target */
- if (target < nr_cpu_ids) {
- cpumask_set_cpu(target, &cstate_module_cpu_mask);
- perf_pmu_migrate_context(&cstate_module_pmu, cpu, target);
- }
- }
- return 0;
-}
-
-static int cstate_cpu_init(unsigned int cpu)
-{
- unsigned int target;
-
- /*
- * If this is the first online thread of that core, set it in
- * the core cpu mask as the designated reader.
- */
- target = cpumask_any_and(&cstate_core_cpu_mask,
- topology_sibling_cpumask(cpu));
-
- if (has_cstate_core && target >= nr_cpu_ids)
- cpumask_set_cpu(cpu, &cstate_core_cpu_mask);
-
- /*
- * If this is the first online thread of that package, set it
- * in the package cpu mask as the designated reader.
- */
- target = cpumask_any_and(&cstate_pkg_cpu_mask,
- topology_die_cpumask(cpu));
- if (has_cstate_pkg && target >= nr_cpu_ids)
- cpumask_set_cpu(cpu, &cstate_pkg_cpu_mask);
-
- /*
- * If this is the first online thread of that cluster, set it
- * in the cluster cpu mask as the designated reader.
- */
- target = cpumask_any_and(&cstate_module_cpu_mask,
- topology_cluster_cpumask(cpu));
- if (has_cstate_module && target >= nr_cpu_ids)
- cpumask_set_cpu(cpu, &cstate_module_cpu_mask);
-
- return 0;
-}
-
static const struct attribute_group *core_attr_update[] = {
&group_cstate_core_c1,
&group_cstate_core_c3,
@@ -531,6 +397,7 @@ static struct pmu cstate_core_pmu = {
.stop = cstate_pmu_event_stop,
.read = cstate_pmu_event_update,
.capabilities = PERF_PMU_CAP_NO_INTERRUPT | PERF_PMU_CAP_NO_EXCLUDE,
+ .scope = PERF_PMU_SCOPE_CORE,
.module = THIS_MODULE,
};
@@ -546,6 +413,7 @@ static struct pmu cstate_pkg_pmu = {
.stop = cstate_pmu_event_stop,
.read = cstate_pmu_event_update,
.capabilities = PERF_PMU_CAP_NO_INTERRUPT | PERF_PMU_CAP_NO_EXCLUDE,
+ .scope = PERF_PMU_SCOPE_PKG,
.module = THIS_MODULE,
};
@@ -561,6 +429,7 @@ static struct pmu cstate_module_pmu = {
.stop = cstate_pmu_event_stop,
.read = cstate_pmu_event_update,
.capabilities = PERF_PMU_CAP_NO_INTERRUPT | PERF_PMU_CAP_NO_EXCLUDE,
+ .scope = PERF_PMU_SCOPE_CLUSTER,
.module = THIS_MODULE,
};
@@ -642,9 +511,18 @@ static const struct cstate_model adl_cstates __initconst = {
.pkg_events = BIT(PERF_CSTATE_PKG_C2_RES) |
BIT(PERF_CSTATE_PKG_C3_RES) |
BIT(PERF_CSTATE_PKG_C6_RES) |
- BIT(PERF_CSTATE_PKG_C7_RES) |
BIT(PERF_CSTATE_PKG_C8_RES) |
- BIT(PERF_CSTATE_PKG_C9_RES) |
+ BIT(PERF_CSTATE_PKG_C10_RES),
+};
+
+static const struct cstate_model lnl_cstates __initconst = {
+ .core_events = BIT(PERF_CSTATE_CORE_C1_RES) |
+ BIT(PERF_CSTATE_CORE_C6_RES) |
+ BIT(PERF_CSTATE_CORE_C7_RES),
+
+ .pkg_events = BIT(PERF_CSTATE_PKG_C2_RES) |
+ BIT(PERF_CSTATE_PKG_C3_RES) |
+ BIT(PERF_CSTATE_PKG_C6_RES) |
BIT(PERF_CSTATE_PKG_C10_RES),
};
@@ -689,85 +567,90 @@ static const struct cstate_model srf_cstates __initconst = {
.core_events = BIT(PERF_CSTATE_CORE_C1_RES) |
BIT(PERF_CSTATE_CORE_C6_RES),
- .pkg_events = BIT(PERF_CSTATE_PKG_C6_RES),
+ .pkg_events = BIT(PERF_CSTATE_PKG_C2_RES) |
+ BIT(PERF_CSTATE_PKG_C6_RES),
.module_events = BIT(PERF_CSTATE_MODULE_C6_RES),
};
static const struct x86_cpu_id intel_cstates_match[] __initconst = {
- X86_MATCH_INTEL_FAM6_MODEL(NEHALEM, &nhm_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(NEHALEM_EP, &nhm_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(NEHALEM_EX, &nhm_cstates),
-
- X86_MATCH_INTEL_FAM6_MODEL(WESTMERE, &nhm_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(WESTMERE_EP, &nhm_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(WESTMERE_EX, &nhm_cstates),
-
- X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE, &snb_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE_X, &snb_cstates),
-
- X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE, &snb_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE_X, &snb_cstates),
-
- X86_MATCH_INTEL_FAM6_MODEL(HASWELL, &snb_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X, &snb_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(HASWELL_G, &snb_cstates),
-
- X86_MATCH_INTEL_FAM6_MODEL(HASWELL_L, &hswult_cstates),
-
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_SILVERMONT, &slm_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_SILVERMONT_D, &slm_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_AIRMONT, &slm_cstates),
-
- X86_MATCH_INTEL_FAM6_MODEL(BROADWELL, &snb_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_D, &snb_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_G, &snb_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X, &snb_cstates),
-
- X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_L, &snb_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE, &snb_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X, &snb_cstates),
-
- X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE_L, &hswult_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE, &hswult_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE_L, &hswult_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE, &hswult_cstates),
-
- X86_MATCH_INTEL_FAM6_MODEL(CANNONLAKE_L, &cnl_cstates),
-
- X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL, &knl_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM, &knl_cstates),
-
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT, &glm_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_D, &glm_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_PLUS, &glm_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, &glm_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT, &glm_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_L, &glm_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_GRACEMONT, &adl_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_CRESTMONT_X, &srf_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_CRESTMONT, &grr_cstates),
-
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, &icl_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE, &icl_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &icx_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, &icx_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &icx_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(EMERALDRAPIDS_X, &icx_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(GRANITERAPIDS_X, &icx_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(GRANITERAPIDS_D, &icx_cstates),
-
- X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, &icl_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, &icl_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(ROCKETLAKE, &icl_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, &adl_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, &adl_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, &adl_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, &adl_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_S, &adl_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE, &adl_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE_L, &adl_cstates),
+ X86_MATCH_VFM(INTEL_NEHALEM, &nhm_cstates),
+ X86_MATCH_VFM(INTEL_NEHALEM_EP, &nhm_cstates),
+ X86_MATCH_VFM(INTEL_NEHALEM_EX, &nhm_cstates),
+
+ X86_MATCH_VFM(INTEL_WESTMERE, &nhm_cstates),
+ X86_MATCH_VFM(INTEL_WESTMERE_EP, &nhm_cstates),
+ X86_MATCH_VFM(INTEL_WESTMERE_EX, &nhm_cstates),
+
+ X86_MATCH_VFM(INTEL_SANDYBRIDGE, &snb_cstates),
+ X86_MATCH_VFM(INTEL_SANDYBRIDGE_X, &snb_cstates),
+
+ X86_MATCH_VFM(INTEL_IVYBRIDGE, &snb_cstates),
+ X86_MATCH_VFM(INTEL_IVYBRIDGE_X, &snb_cstates),
+
+ X86_MATCH_VFM(INTEL_HASWELL, &snb_cstates),
+ X86_MATCH_VFM(INTEL_HASWELL_X, &snb_cstates),
+ X86_MATCH_VFM(INTEL_HASWELL_G, &snb_cstates),
+
+ X86_MATCH_VFM(INTEL_HASWELL_L, &hswult_cstates),
+
+ X86_MATCH_VFM(INTEL_ATOM_SILVERMONT, &slm_cstates),
+ X86_MATCH_VFM(INTEL_ATOM_SILVERMONT_D, &slm_cstates),
+ X86_MATCH_VFM(INTEL_ATOM_AIRMONT, &slm_cstates),
+
+ X86_MATCH_VFM(INTEL_BROADWELL, &snb_cstates),
+ X86_MATCH_VFM(INTEL_BROADWELL_D, &snb_cstates),
+ X86_MATCH_VFM(INTEL_BROADWELL_G, &snb_cstates),
+ X86_MATCH_VFM(INTEL_BROADWELL_X, &snb_cstates),
+
+ X86_MATCH_VFM(INTEL_SKYLAKE_L, &snb_cstates),
+ X86_MATCH_VFM(INTEL_SKYLAKE, &snb_cstates),
+ X86_MATCH_VFM(INTEL_SKYLAKE_X, &snb_cstates),
+
+ X86_MATCH_VFM(INTEL_KABYLAKE_L, &hswult_cstates),
+ X86_MATCH_VFM(INTEL_KABYLAKE, &hswult_cstates),
+ X86_MATCH_VFM(INTEL_COMETLAKE_L, &hswult_cstates),
+ X86_MATCH_VFM(INTEL_COMETLAKE, &hswult_cstates),
+
+ X86_MATCH_VFM(INTEL_CANNONLAKE_L, &cnl_cstates),
+
+ X86_MATCH_VFM(INTEL_XEON_PHI_KNL, &knl_cstates),
+ X86_MATCH_VFM(INTEL_XEON_PHI_KNM, &knl_cstates),
+
+ X86_MATCH_VFM(INTEL_ATOM_GOLDMONT, &glm_cstates),
+ X86_MATCH_VFM(INTEL_ATOM_GOLDMONT_D, &glm_cstates),
+ X86_MATCH_VFM(INTEL_ATOM_GOLDMONT_PLUS, &glm_cstates),
+ X86_MATCH_VFM(INTEL_ATOM_TREMONT_D, &glm_cstates),
+ X86_MATCH_VFM(INTEL_ATOM_TREMONT, &glm_cstates),
+ X86_MATCH_VFM(INTEL_ATOM_TREMONT_L, &glm_cstates),
+ X86_MATCH_VFM(INTEL_ATOM_GRACEMONT, &adl_cstates),
+ X86_MATCH_VFM(INTEL_ATOM_CRESTMONT_X, &srf_cstates),
+ X86_MATCH_VFM(INTEL_ATOM_CRESTMONT, &grr_cstates),
+
+ X86_MATCH_VFM(INTEL_ICELAKE_L, &icl_cstates),
+ X86_MATCH_VFM(INTEL_ICELAKE, &icl_cstates),
+ X86_MATCH_VFM(INTEL_ICELAKE_X, &icx_cstates),
+ X86_MATCH_VFM(INTEL_ICELAKE_D, &icx_cstates),
+ X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, &icx_cstates),
+ X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, &icx_cstates),
+ X86_MATCH_VFM(INTEL_GRANITERAPIDS_X, &icx_cstates),
+ X86_MATCH_VFM(INTEL_GRANITERAPIDS_D, &icx_cstates),
+
+ X86_MATCH_VFM(INTEL_TIGERLAKE_L, &icl_cstates),
+ X86_MATCH_VFM(INTEL_TIGERLAKE, &icl_cstates),
+ X86_MATCH_VFM(INTEL_ROCKETLAKE, &icl_cstates),
+ X86_MATCH_VFM(INTEL_ALDERLAKE, &adl_cstates),
+ X86_MATCH_VFM(INTEL_ALDERLAKE_L, &adl_cstates),
+ X86_MATCH_VFM(INTEL_RAPTORLAKE, &adl_cstates),
+ X86_MATCH_VFM(INTEL_RAPTORLAKE_P, &adl_cstates),
+ X86_MATCH_VFM(INTEL_RAPTORLAKE_S, &adl_cstates),
+ X86_MATCH_VFM(INTEL_METEORLAKE, &adl_cstates),
+ X86_MATCH_VFM(INTEL_METEORLAKE_L, &adl_cstates),
+ X86_MATCH_VFM(INTEL_ARROWLAKE, &adl_cstates),
+ X86_MATCH_VFM(INTEL_ARROWLAKE_H, &adl_cstates),
+ X86_MATCH_VFM(INTEL_ARROWLAKE_U, &adl_cstates),
+ X86_MATCH_VFM(INTEL_LUNARLAKE_M, &lnl_cstates),
{ },
};
MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match);
@@ -801,9 +684,6 @@ static int __init cstate_probe(const struct cstate_model *cm)
static inline void cstate_cleanup(void)
{
- cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_CSTATE_ONLINE);
- cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_CSTATE_STARTING);
-
if (has_cstate_core)
perf_pmu_unregister(&cstate_core_pmu);
@@ -818,11 +698,6 @@ static int __init cstate_init(void)
{
int err;
- cpuhp_setup_state(CPUHP_AP_PERF_X86_CSTATE_STARTING,
- "perf/x86/cstate:starting", cstate_cpu_init, NULL);
- cpuhp_setup_state(CPUHP_AP_PERF_X86_CSTATE_ONLINE,
- "perf/x86/cstate:online", NULL, cstate_cpu_exit);
-
if (has_cstate_core) {
err = perf_pmu_register(&cstate_core_pmu, cstate_core_pmu.name, -1);
if (err) {
@@ -835,6 +710,8 @@ static int __init cstate_init(void)
if (has_cstate_pkg) {
if (topology_max_dies_per_package() > 1) {
+ /* CLX-AP is multi-die and the cstate is die-scope */
+ cstate_pkg_pmu.scope = PERF_PMU_SCOPE_DIE;
err = perf_pmu_register(&cstate_pkg_pmu,
"cstate_die", -1);
} else {
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index e010bfed8417..fa5ea65de0d0 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -63,6 +63,15 @@ union intel_x86_pebs_dse {
unsigned int mtl_fwd_blk:1;
unsigned int ld_reserved4:24;
};
+ struct {
+ unsigned int lnc_dse:8;
+ unsigned int ld_reserved5:2;
+ unsigned int lnc_stlb_miss:1;
+ unsigned int lnc_locked:1;
+ unsigned int lnc_data_blk:1;
+ unsigned int lnc_addr_blk:1;
+ unsigned int ld_reserved6:18;
+ };
};
@@ -77,7 +86,7 @@ union intel_x86_pebs_dse {
#define SNOOP_NONE_MISS (P(SNOOP, NONE) | P(SNOOP, MISS))
/* Version for Sandy Bridge and later */
-static u64 pebs_data_source[] = {
+static u64 pebs_data_source[PERF_PEBS_DATA_SOURCE_MAX] = {
P(OP, LOAD) | P(LVL, MISS) | LEVEL(L3) | P(SNOOP, NA),/* 0x00:ukn L3 */
OP_LH | P(LVL, L1) | LEVEL(L1) | P(SNOOP, NONE), /* 0x01: L1 local */
OP_LH | P(LVL, LFB) | LEVEL(LFB) | P(SNOOP, NONE), /* 0x02: LFB hit */
@@ -173,6 +182,40 @@ void __init intel_pmu_pebs_data_source_cmt(void)
__intel_pmu_pebs_data_source_cmt(pebs_data_source);
}
+/* Version for Lion Cove and later */
+static u64 lnc_pebs_data_source[PERF_PEBS_DATA_SOURCE_MAX] = {
+ P(OP, LOAD) | P(LVL, MISS) | LEVEL(L3) | P(SNOOP, NA), /* 0x00: ukn L3 */
+ OP_LH | P(LVL, L1) | LEVEL(L1) | P(SNOOP, NONE), /* 0x01: L1 hit */
+ OP_LH | P(LVL, L1) | LEVEL(L1) | P(SNOOP, NONE), /* 0x02: L1 hit */
+ OP_LH | P(LVL, LFB) | LEVEL(LFB) | P(SNOOP, NONE), /* 0x03: LFB/L1 Miss Handling Buffer hit */
+ 0, /* 0x04: Reserved */
+ OP_LH | P(LVL, L2) | LEVEL(L2) | P(SNOOP, NONE), /* 0x05: L2 Hit */
+ OP_LH | LEVEL(L2_MHB) | P(SNOOP, NONE), /* 0x06: L2 Miss Handling Buffer Hit */
+ 0, /* 0x07: Reserved */
+ OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, NONE), /* 0x08: L3 Hit */
+ 0, /* 0x09: Reserved */
+ 0, /* 0x0a: Reserved */
+ 0, /* 0x0b: Reserved */
+ OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOPX, FWD), /* 0x0c: L3 Hit Snoop Fwd */
+ OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM), /* 0x0d: L3 Hit Snoop HitM */
+ 0, /* 0x0e: Reserved */
+ P(OP, LOAD) | P(LVL, MISS) | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM), /* 0x0f: L3 Miss Snoop HitM */
+ OP_LH | LEVEL(MSC) | P(SNOOP, NONE), /* 0x10: Memory-side Cache Hit */
+ OP_LH | P(LVL, LOC_RAM) | LEVEL(RAM) | P(SNOOP, NONE), /* 0x11: Local Memory Hit */
+};
+
+void __init intel_pmu_pebs_data_source_lnl(void)
+{
+ u64 *data_source;
+
+ data_source = x86_pmu.hybrid_pmu[X86_HYBRID_PMU_CORE_IDX].pebs_data_source;
+ memcpy(data_source, lnc_pebs_data_source, sizeof(lnc_pebs_data_source));
+
+ data_source = x86_pmu.hybrid_pmu[X86_HYBRID_PMU_ATOM_IDX].pebs_data_source;
+ memcpy(data_source, pebs_data_source, sizeof(pebs_data_source));
+ __intel_pmu_pebs_data_source_cmt(data_source);
+}
+
static u64 precise_store_data(u64 status)
{
union intel_x86_pebs_dse dse;
@@ -257,14 +300,14 @@ static inline void pebs_set_tlb_lock(u64 *val, bool tlb, bool lock)
}
/* Retrieve the latency data for e-core of ADL */
-static u64 __adl_latency_data_small(struct perf_event *event, u64 status,
- u8 dse, bool tlb, bool lock, bool blk)
+static u64 __grt_latency_data(struct perf_event *event, u64 status,
+ u8 dse, bool tlb, bool lock, bool blk)
{
u64 val;
WARN_ON_ONCE(hybrid_pmu(event->pmu)->pmu_type == hybrid_big);
- dse &= PERF_PEBS_DATA_SOURCE_MASK;
+ dse &= PERF_PEBS_DATA_SOURCE_GRT_MASK;
val = hybrid_var(event->pmu, pebs_data_source)[dse];
pebs_set_tlb_lock(&val, tlb, lock);
@@ -277,27 +320,72 @@ static u64 __adl_latency_data_small(struct perf_event *event, u64 status,
return val;
}
-u64 adl_latency_data_small(struct perf_event *event, u64 status)
+u64 grt_latency_data(struct perf_event *event, u64 status)
{
union intel_x86_pebs_dse dse;
dse.val = status;
- return __adl_latency_data_small(event, status, dse.ld_dse,
- dse.ld_locked, dse.ld_stlb_miss,
- dse.ld_data_blk);
+ return __grt_latency_data(event, status, dse.ld_dse,
+ dse.ld_locked, dse.ld_stlb_miss,
+ dse.ld_data_blk);
}
/* Retrieve the latency data for e-core of MTL */
-u64 mtl_latency_data_small(struct perf_event *event, u64 status)
+u64 cmt_latency_data(struct perf_event *event, u64 status)
{
union intel_x86_pebs_dse dse;
dse.val = status;
- return __adl_latency_data_small(event, status, dse.mtl_dse,
- dse.mtl_stlb_miss, dse.mtl_locked,
- dse.mtl_fwd_blk);
+ return __grt_latency_data(event, status, dse.mtl_dse,
+ dse.mtl_stlb_miss, dse.mtl_locked,
+ dse.mtl_fwd_blk);
+}
+
+static u64 lnc_latency_data(struct perf_event *event, u64 status)
+{
+ union intel_x86_pebs_dse dse;
+ union perf_mem_data_src src;
+ u64 val;
+
+ dse.val = status;
+
+ /* LNC core latency data */
+ val = hybrid_var(event->pmu, pebs_data_source)[status & PERF_PEBS_DATA_SOURCE_MASK];
+ if (!val)
+ val = P(OP, LOAD) | LEVEL(NA) | P(SNOOP, NA);
+
+ if (dse.lnc_stlb_miss)
+ val |= P(TLB, MISS) | P(TLB, L2);
+ else
+ val |= P(TLB, HIT) | P(TLB, L1) | P(TLB, L2);
+
+ if (dse.lnc_locked)
+ val |= P(LOCK, LOCKED);
+
+ if (dse.lnc_data_blk)
+ val |= P(BLK, DATA);
+ if (dse.lnc_addr_blk)
+ val |= P(BLK, ADDR);
+ if (!dse.lnc_data_blk && !dse.lnc_addr_blk)
+ val |= P(BLK, NA);
+
+ src.val = val;
+ if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW)
+ src.mem_op = P(OP, STORE);
+
+ return src.val;
+}
+
+u64 lnl_latency_data(struct perf_event *event, u64 status)
+{
+ struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu);
+
+ if (pmu->pmu_type == hybrid_small)
+ return cmt_latency_data(event, status);
+
+ return lnc_latency_data(event, status);
}
static u64 load_latency_data(struct perf_event *event, u64 status)
@@ -1086,6 +1174,32 @@ struct event_constraint intel_glc_pebs_event_constraints[] = {
EVENT_CONSTRAINT_END
};
+struct event_constraint intel_lnc_pebs_event_constraints[] = {
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x100, 0x100000000ULL), /* INST_RETIRED.PREC_DIST */
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x0400, 0x800000000ULL),
+
+ INTEL_HYBRID_LDLAT_CONSTRAINT(0x1cd, 0x3ff),
+ INTEL_HYBRID_STLAT_CONSTRAINT(0x2cd, 0x3),
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_STORES */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf), /* MEM_INST_RETIRED.LOCK_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x41d0, 0xf), /* MEM_INST_RETIRED.SPLIT_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x42d0, 0xf), /* MEM_INST_RETIRED.SPLIT_STORES */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x81d0, 0xf), /* MEM_INST_RETIRED.ALL_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x82d0, 0xf), /* MEM_INST_RETIRED.ALL_STORES */
+
+ INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD_RANGE(0xd1, 0xd4, 0xf),
+
+ INTEL_FLAGS_EVENT_CONSTRAINT(0xd0, 0xf),
+
+ /*
+ * Everything else is handled by PMU_FL_PEBS_ALL, because we
+ * need the full constraints from the main table.
+ */
+
+ EVENT_CONSTRAINT_END
+};
+
struct event_constraint *intel_pebs_constraints(struct perf_event *event)
{
struct event_constraint *pebs_constraints = hybrid(event->pmu, pebs_constraints);
@@ -1137,8 +1251,7 @@ void intel_pmu_pebs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sche
static inline void pebs_update_threshold(struct cpu_hw_events *cpuc)
{
struct debug_store *ds = cpuc->ds;
- int max_pebs_events = hybrid(cpuc->pmu, max_pebs_events);
- int num_counters_fixed = hybrid(cpuc->pmu, num_counters_fixed);
+ int max_pebs_events = intel_pmu_max_num_pebs(cpuc->pmu);
u64 threshold;
int reserved;
@@ -1146,7 +1259,7 @@ static inline void pebs_update_threshold(struct cpu_hw_events *cpuc)
return;
if (x86_pmu.flags & PMU_FL_PEBS_ALL)
- reserved = max_pebs_events + num_counters_fixed;
+ reserved = max_pebs_events + x86_pmu_max_num_counters_fixed(cpuc->pmu);
else
reserved = max_pebs_events;
@@ -1831,8 +1944,12 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
set_linear_ip(regs, basic->ip);
regs->flags = PERF_EFLAGS_EXACT;
- if ((sample_type & PERF_SAMPLE_WEIGHT_STRUCT) && (x86_pmu.flags & PMU_FL_RETIRE_LATENCY))
- data->weight.var3_w = format_size >> PEBS_RETIRE_LATENCY_OFFSET & PEBS_LATENCY_MASK;
+ if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT) {
+ if (x86_pmu.flags & PMU_FL_RETIRE_LATENCY)
+ data->weight.var3_w = format_size >> PEBS_RETIRE_LATENCY_OFFSET & PEBS_LATENCY_MASK;
+ else
+ data->weight.var3_w = 0;
+ }
/*
* The record for MEMINFO is in front of GP
@@ -2157,6 +2274,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, struct perf_sample_d
void *base, *at, *top;
short counts[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {};
short error[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {};
+ int max_pebs_events = intel_pmu_max_num_pebs(NULL);
int bit, i, size;
u64 mask;
@@ -2168,11 +2286,11 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, struct perf_sample_d
ds->pebs_index = ds->pebs_buffer_base;
- mask = (1ULL << x86_pmu.max_pebs_events) - 1;
- size = x86_pmu.max_pebs_events;
+ mask = x86_pmu.pebs_events_mask;
+ size = max_pebs_events;
if (x86_pmu.flags & PMU_FL_PEBS_ALL) {
- mask |= ((1ULL << x86_pmu.num_counters_fixed) - 1) << INTEL_PMC_IDX_FIXED;
- size = INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed;
+ mask |= x86_pmu.fixed_cntr_mask64 << INTEL_PMC_IDX_FIXED;
+ size = INTEL_PMC_IDX_FIXED + x86_pmu_max_num_counters_fixed(NULL);
}
if (unlikely(base >= top)) {
@@ -2208,8 +2326,9 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, struct perf_sample_d
pebs_status = p->status = cpuc->pebs_enabled;
bit = find_first_bit((unsigned long *)&pebs_status,
- x86_pmu.max_pebs_events);
- if (bit >= x86_pmu.max_pebs_events)
+ max_pebs_events);
+
+ if (!(x86_pmu.pebs_events_mask & (1 << bit)))
continue;
/*
@@ -2267,12 +2386,10 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_d
{
short counts[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {};
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
- int max_pebs_events = hybrid(cpuc->pmu, max_pebs_events);
- int num_counters_fixed = hybrid(cpuc->pmu, num_counters_fixed);
struct debug_store *ds = cpuc->ds;
struct perf_event *event;
void *base, *at, *top;
- int bit, size;
+ int bit;
u64 mask;
if (!x86_pmu.pebs_active)
@@ -2283,12 +2400,11 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_d
ds->pebs_index = ds->pebs_buffer_base;
- mask = ((1ULL << max_pebs_events) - 1) |
- (((1ULL << num_counters_fixed) - 1) << INTEL_PMC_IDX_FIXED);
- size = INTEL_PMC_IDX_FIXED + num_counters_fixed;
+ mask = hybrid(cpuc->pmu, pebs_events_mask) |
+ (hybrid(cpuc->pmu, fixed_cntr_mask64) << INTEL_PMC_IDX_FIXED);
if (unlikely(base >= top)) {
- intel_pmu_pebs_event_update_no_drain(cpuc, size);
+ intel_pmu_pebs_event_update_no_drain(cpuc, X86_PMC_IDX_MAX);
return;
}
@@ -2298,11 +2414,11 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_d
pebs_status = get_pebs_status(at) & cpuc->pebs_enabled;
pebs_status &= mask;
- for_each_set_bit(bit, (unsigned long *)&pebs_status, size)
+ for_each_set_bit(bit, (unsigned long *)&pebs_status, X86_PMC_IDX_MAX)
counts[bit]++;
}
- for_each_set_bit(bit, (unsigned long *)&mask, size) {
+ for_each_set_bit(bit, (unsigned long *)&mask, X86_PMC_IDX_MAX) {
if (counts[bit] == 0)
continue;
diff --git a/arch/x86/events/intel/knc.c b/arch/x86/events/intel/knc.c
index 618001c208e8..034a1f6a457c 100644
--- a/arch/x86/events/intel/knc.c
+++ b/arch/x86/events/intel/knc.c
@@ -303,7 +303,7 @@ static const struct x86_pmu knc_pmu __initconst = {
.apic = 1,
.max_period = (1ULL << 39) - 1,
.version = 0,
- .num_counters = 2,
+ .cntr_mask64 = 0x3,
.cntval_bits = 40,
.cntval_mask = (1ULL << 40) - 1,
.get_event_constraints = x86_get_event_constraints,
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index 4367aa77cb8d..dc641b50814e 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -2,6 +2,7 @@
#include <linux/perf_event.h>
#include <linux/types.h>
+#include <asm/cpu_device_id.h>
#include <asm/perf_event.h>
#include <asm/msr.h>
@@ -1457,7 +1458,7 @@ void __init intel_pmu_lbr_init_atom(void)
* to have an operational LBR which can freeze
* on PMU interrupt
*/
- if (boot_cpu_data.x86_model == 28
+ if (boot_cpu_data.x86_vfm == INTEL_ATOM_BONNELL
&& boot_cpu_data.x86_stepping < 10) {
pr_cont("LBR disabled due to erratum");
return;
diff --git a/arch/x86/events/intel/p4.c b/arch/x86/events/intel/p4.c
index 35936188db01..844bc4fc4724 100644
--- a/arch/x86/events/intel/p4.c
+++ b/arch/x86/events/intel/p4.c
@@ -919,7 +919,7 @@ static void p4_pmu_disable_all(void)
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
int idx;
- for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+ for_each_set_bit(idx, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) {
struct perf_event *event = cpuc->events[idx];
if (!test_bit(idx, cpuc->active_mask))
continue;
@@ -998,7 +998,7 @@ static void p4_pmu_enable_all(int added)
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
int idx;
- for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+ for_each_set_bit(idx, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) {
struct perf_event *event = cpuc->events[idx];
if (!test_bit(idx, cpuc->active_mask))
continue;
@@ -1040,7 +1040,7 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
cpuc = this_cpu_ptr(&cpu_hw_events);
- for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+ for_each_set_bit(idx, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) {
int overflow;
if (!test_bit(idx, cpuc->active_mask)) {
@@ -1353,7 +1353,7 @@ static __initconst const struct x86_pmu p4_pmu = {
* though leave it restricted at moment assuming
* HT is on
*/
- .num_counters = ARCH_P4_MAX_CCCR,
+ .cntr_mask64 = GENMASK_ULL(ARCH_P4_MAX_CCCR - 1, 0),
.apic = 1,
.cntval_bits = ARCH_P4_CNTRVAL_BITS,
.cntval_mask = ARCH_P4_CNTRVAL_MASK,
@@ -1395,7 +1395,7 @@ __init int p4_pmu_init(void)
*
* Solve this by zero'ing out the registers to mimic a reset.
*/
- for (i = 0; i < x86_pmu.num_counters; i++) {
+ for_each_set_bit(i, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) {
reg = x86_pmu_config_addr(i);
wrmsrl_safe(reg, 0ULL);
}
diff --git a/arch/x86/events/intel/p6.c b/arch/x86/events/intel/p6.c
index 408879b0c0d4..a6cffb4f4ef5 100644
--- a/arch/x86/events/intel/p6.c
+++ b/arch/x86/events/intel/p6.c
@@ -214,7 +214,7 @@ static __initconst const struct x86_pmu p6_pmu = {
.apic = 1,
.max_period = (1ULL << 31) - 1,
.version = 0,
- .num_counters = 2,
+ .cntr_mask64 = 0x3,
/*
* Events have 40 bits implemented. However they are designed such
* that bits [32-39] are sign extensions of bit 31. As such the
diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
index 8e2a12235e62..fd4670a6694e 100644
--- a/arch/x86/events/intel/pt.c
+++ b/arch/x86/events/intel/pt.c
@@ -22,7 +22,7 @@
#include <asm/insn.h>
#include <asm/io.h>
#include <asm/intel_pt.h>
-#include <asm/intel-family.h>
+#include <asm/cpu_device_id.h>
#include "../perf_event.h"
#include "pt.h"
@@ -211,11 +211,11 @@ static int __init pt_pmu_hw_init(void)
}
/* model-specific quirks */
- switch (boot_cpu_data.x86_model) {
- case INTEL_FAM6_BROADWELL:
- case INTEL_FAM6_BROADWELL_D:
- case INTEL_FAM6_BROADWELL_G:
- case INTEL_FAM6_BROADWELL_X:
+ switch (boot_cpu_data.x86_vfm) {
+ case INTEL_BROADWELL:
+ case INTEL_BROADWELL_D:
+ case INTEL_BROADWELL_G:
+ case INTEL_BROADWELL_X:
/* not setting BRANCH_EN will #GP, erratum BDM106 */
pt_pmu.branch_en_always_on = true;
break;
@@ -416,7 +416,7 @@ static bool pt_event_valid(struct perf_event *event)
static void pt_config_start(struct perf_event *event)
{
struct pt *pt = this_cpu_ptr(&pt_ctx);
- u64 ctl = event->hw.config;
+ u64 ctl = event->hw.aux_config;
ctl |= RTIT_CTL_TRACEEN;
if (READ_ONCE(pt->vmx_on))
@@ -424,7 +424,7 @@ static void pt_config_start(struct perf_event *event)
else
wrmsrl(MSR_IA32_RTIT_CTL, ctl);
- WRITE_ONCE(event->hw.config, ctl);
+ WRITE_ONCE(event->hw.aux_config, ctl);
}
/* Address ranges and their corresponding msr configuration registers */
@@ -503,7 +503,7 @@ static void pt_config(struct perf_event *event)
u64 reg;
/* First round: clear STATUS, in particular the PSB byte counter. */
- if (!event->hw.config) {
+ if (!event->hw.aux_config) {
perf_event_itrace_started(event);
wrmsrl(MSR_IA32_RTIT_STATUS, 0);
}
@@ -533,14 +533,14 @@ static void pt_config(struct perf_event *event)
reg |= (event->attr.config & PT_CONFIG_MASK);
- event->hw.config = reg;
+ event->hw.aux_config = reg;
pt_config_start(event);
}
static void pt_config_stop(struct perf_event *event)
{
struct pt *pt = this_cpu_ptr(&pt_ctx);
- u64 ctl = READ_ONCE(event->hw.config);
+ u64 ctl = READ_ONCE(event->hw.aux_config);
/* may be already stopped by a PMI */
if (!(ctl & RTIT_CTL_TRACEEN))
@@ -550,7 +550,7 @@ static void pt_config_stop(struct perf_event *event)
if (!READ_ONCE(pt->vmx_on))
wrmsrl(MSR_IA32_RTIT_CTL, ctl);
- WRITE_ONCE(event->hw.config, ctl);
+ WRITE_ONCE(event->hw.aux_config, ctl);
/*
* A wrmsr that disables trace generation serializes other PT
@@ -878,7 +878,7 @@ static void pt_update_head(struct pt *pt)
*/
static void *pt_buffer_region(struct pt_buffer *buf)
{
- return phys_to_virt(TOPA_ENTRY(buf->cur, buf->cur_idx)->base << TOPA_SHIFT);
+ return phys_to_virt((phys_addr_t)TOPA_ENTRY(buf->cur, buf->cur_idx)->base << TOPA_SHIFT);
}
/**
@@ -990,7 +990,7 @@ pt_topa_entry_for_page(struct pt_buffer *buf, unsigned int pg)
* order allocations, there shouldn't be many of these.
*/
list_for_each_entry(topa, &buf->tables, list) {
- if (topa->offset + topa->size > pg << PAGE_SHIFT)
+ if (topa->offset + topa->size > (unsigned long)pg << PAGE_SHIFT)
goto found;
}
@@ -1557,7 +1557,7 @@ void intel_pt_handle_vmx(int on)
/* Turn PTs back on */
if (!on && event)
- wrmsrl(MSR_IA32_RTIT_CTL, event->hw.config);
+ wrmsrl(MSR_IA32_RTIT_CTL, event->hw.aux_config);
local_irq_restore(flags);
}
@@ -1606,6 +1606,7 @@ static void pt_event_stop(struct perf_event *event, int mode)
* see comment in intel_pt_interrupt().
*/
WRITE_ONCE(pt->handle_nmi, 0);
+ barrier();
pt_config_stop(event);
@@ -1657,11 +1658,10 @@ static long pt_event_snapshot_aux(struct perf_event *event,
return 0;
/*
- * Here, handle_nmi tells us if the tracing is on
+ * There is no PT interrupt in this mode, so stop the trace and it will
+ * remain stopped while the buffer is copied.
*/
- if (READ_ONCE(pt->handle_nmi))
- pt_config_stop(event);
-
+ pt_config_stop(event);
pt_read_offset(buf);
pt_update_head(pt);
@@ -1673,11 +1673,10 @@ static long pt_event_snapshot_aux(struct perf_event *event,
ret = perf_output_copy_aux(&pt->handle, handle, from, to);
/*
- * If the tracing was on when we turned up, restart it.
- * Compiler barrier not needed as we couldn't have been
- * preempted by anything that touches pt->handle_nmi.
+ * Here, handle_nmi tells us if the tracing was on.
+ * If the tracing was on, restart it.
*/
- if (pt->handle_nmi)
+ if (READ_ONCE(pt->handle_nmi))
pt_config_start(event);
return ret;
diff --git a/arch/x86/events/intel/pt.h b/arch/x86/events/intel/pt.h
index 96906a62aacd..f5e46c04c145 100644
--- a/arch/x86/events/intel/pt.h
+++ b/arch/x86/events/intel/pt.h
@@ -33,8 +33,8 @@ struct topa_entry {
u64 rsvd2 : 1;
u64 size : 4;
u64 rsvd3 : 2;
- u64 base : 36;
- u64 rsvd4 : 16;
+ u64 base : 40;
+ u64 rsvd4 : 12;
};
/* TSC to Core Crystal Clock Ratio */
diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index 258e2cdf28fa..d98fac567684 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -34,6 +34,7 @@ static struct event_constraint uncore_constraint_fixed =
struct event_constraint uncore_constraint_empty =
EVENT_CONSTRAINT(0, 0, 0);
+MODULE_DESCRIPTION("Support for Intel uncore performance events");
MODULE_LICENSE("GPL");
int uncore_pcibus_to_dieid(struct pci_bus *bus)
@@ -263,6 +264,9 @@ static void uncore_assign_hw_event(struct intel_uncore_box *box,
return;
}
+ if (intel_generic_uncore_assign_hw_event(event, box))
+ return;
+
hwc->config_base = uncore_event_ctl(box, hwc->idx);
hwc->event_base = uncore_perf_ctr(box, hwc->idx);
}
@@ -843,7 +847,9 @@ static void uncore_pmu_disable(struct pmu *pmu)
static ssize_t uncore_get_attr_cpumask(struct device *dev,
struct device_attribute *attr, char *buf)
{
- return cpumap_print_to_pagebuf(true, buf, &uncore_cpu_mask);
+ struct intel_uncore_pmu *pmu = container_of(dev_get_drvdata(dev), struct intel_uncore_pmu, pmu);
+
+ return cpumap_print_to_pagebuf(true, buf, &pmu->cpu_mask);
}
static DEVICE_ATTR(cpumask, S_IRUGO, uncore_get_attr_cpumask, NULL);
@@ -860,7 +866,10 @@ static const struct attribute_group uncore_pmu_attr_group = {
static inline int uncore_get_box_id(struct intel_uncore_type *type,
struct intel_uncore_pmu *pmu)
{
- return type->box_ids ? type->box_ids[pmu->pmu_idx] : pmu->pmu_idx;
+ if (type->boxes)
+ return intel_uncore_find_discovery_unit_id(type->boxes, -1, pmu->pmu_idx);
+
+ return pmu->pmu_idx;
}
void uncore_get_alias_name(char *pmu_name, struct intel_uncore_pmu *pmu)
@@ -961,6 +970,9 @@ static void uncore_type_exit(struct intel_uncore_type *type)
if (type->cleanup_mapping)
type->cleanup_mapping(type);
+ if (type->cleanup_extra_boxes)
+ type->cleanup_extra_boxes(type);
+
if (pmu) {
for (i = 0; i < type->num_boxes; i++, pmu++) {
uncore_pmu_unregister(pmu);
@@ -969,10 +981,7 @@ static void uncore_type_exit(struct intel_uncore_type *type)
kfree(type->pmus);
type->pmus = NULL;
}
- if (type->box_ids) {
- kfree(type->box_ids);
- type->box_ids = NULL;
- }
+
kfree(type->events_group);
type->events_group = NULL;
}
@@ -1076,22 +1085,19 @@ static struct intel_uncore_pmu *
uncore_pci_find_dev_pmu_from_types(struct pci_dev *pdev)
{
struct intel_uncore_type **types = uncore_pci_uncores;
+ struct intel_uncore_discovery_unit *unit;
struct intel_uncore_type *type;
- u64 box_ctl;
- int i, die;
+ struct rb_node *node;
for (; *types; types++) {
type = *types;
- for (die = 0; die < __uncore_max_dies; die++) {
- for (i = 0; i < type->num_boxes; i++) {
- if (!type->box_ctls[die])
- continue;
- box_ctl = type->box_ctls[die] + type->pci_offsets[i];
- if (pdev->devfn == UNCORE_DISCOVERY_PCI_DEVFN(box_ctl) &&
- pdev->bus->number == UNCORE_DISCOVERY_PCI_BUS(box_ctl) &&
- pci_domain_nr(pdev->bus) == UNCORE_DISCOVERY_PCI_DOMAIN(box_ctl))
- return &type->pmus[i];
- }
+
+ for (node = rb_first(type->boxes); node; node = rb_next(node)) {
+ unit = rb_entry(node, struct intel_uncore_discovery_unit, node);
+ if (pdev->devfn == UNCORE_DISCOVERY_PCI_DEVFN(unit->addr) &&
+ pdev->bus->number == UNCORE_DISCOVERY_PCI_BUS(unit->addr) &&
+ pci_domain_nr(pdev->bus) == UNCORE_DISCOVERY_PCI_DOMAIN(unit->addr))
+ return &type->pmus[unit->pmu_idx];
}
}
@@ -1367,28 +1373,25 @@ static struct notifier_block uncore_pci_notifier = {
static void uncore_pci_pmus_register(void)
{
struct intel_uncore_type **types = uncore_pci_uncores;
+ struct intel_uncore_discovery_unit *unit;
struct intel_uncore_type *type;
struct intel_uncore_pmu *pmu;
+ struct rb_node *node;
struct pci_dev *pdev;
- u64 box_ctl;
- int i, die;
for (; *types; types++) {
type = *types;
- for (die = 0; die < __uncore_max_dies; die++) {
- for (i = 0; i < type->num_boxes; i++) {
- if (!type->box_ctls[die])
- continue;
- box_ctl = type->box_ctls[die] + type->pci_offsets[i];
- pdev = pci_get_domain_bus_and_slot(UNCORE_DISCOVERY_PCI_DOMAIN(box_ctl),
- UNCORE_DISCOVERY_PCI_BUS(box_ctl),
- UNCORE_DISCOVERY_PCI_DEVFN(box_ctl));
- if (!pdev)
- continue;
- pmu = &type->pmus[i];
-
- uncore_pci_pmu_register(pdev, type, pmu, die);
- }
+
+ for (node = rb_first(type->boxes); node; node = rb_next(node)) {
+ unit = rb_entry(node, struct intel_uncore_discovery_unit, node);
+ pdev = pci_get_domain_bus_and_slot(UNCORE_DISCOVERY_PCI_DOMAIN(unit->addr),
+ UNCORE_DISCOVERY_PCI_BUS(unit->addr),
+ UNCORE_DISCOVERY_PCI_DEVFN(unit->addr));
+
+ if (!pdev)
+ continue;
+ pmu = &type->pmus[unit->pmu_idx];
+ uncore_pci_pmu_register(pdev, type, pmu, unit->die);
}
}
@@ -1453,6 +1456,18 @@ static void uncore_pci_exit(void)
}
}
+static bool uncore_die_has_box(struct intel_uncore_type *type,
+ int die, unsigned int pmu_idx)
+{
+ if (!type->boxes)
+ return true;
+
+ if (intel_uncore_find_discovery_unit_id(type->boxes, die, pmu_idx) < 0)
+ return false;
+
+ return true;
+}
+
static void uncore_change_type_ctx(struct intel_uncore_type *type, int old_cpu,
int new_cpu)
{
@@ -1468,18 +1483,25 @@ static void uncore_change_type_ctx(struct intel_uncore_type *type, int old_cpu,
if (old_cpu < 0) {
WARN_ON_ONCE(box->cpu != -1);
- box->cpu = new_cpu;
+ if (uncore_die_has_box(type, die, pmu->pmu_idx)) {
+ box->cpu = new_cpu;
+ cpumask_set_cpu(new_cpu, &pmu->cpu_mask);
+ }
continue;
}
- WARN_ON_ONCE(box->cpu != old_cpu);
+ WARN_ON_ONCE(box->cpu != -1 && box->cpu != old_cpu);
box->cpu = -1;
+ cpumask_clear_cpu(old_cpu, &pmu->cpu_mask);
if (new_cpu < 0)
continue;
+ if (!uncore_die_has_box(type, die, pmu->pmu_idx))
+ continue;
uncore_pmu_cancel_hrtimer(box);
perf_pmu_migrate_context(&pmu->pmu, old_cpu, new_cpu);
box->cpu = new_cpu;
+ cpumask_set_cpu(new_cpu, &pmu->cpu_mask);
}
}
@@ -1502,7 +1524,7 @@ static void uncore_box_unref(struct intel_uncore_type **types, int id)
pmu = type->pmus;
for (i = 0; i < type->num_boxes; i++, pmu++) {
box = pmu->boxes[id];
- if (box && atomic_dec_return(&box->refcnt) == 0)
+ if (box && box->cpu >= 0 && atomic_dec_return(&box->refcnt) == 0)
uncore_box_exit(box);
}
}
@@ -1592,7 +1614,7 @@ static int uncore_box_ref(struct intel_uncore_type **types,
pmu = type->pmus;
for (i = 0; i < type->num_boxes; i++, pmu++) {
box = pmu->boxes[id];
- if (box && atomic_inc_return(&box->refcnt) == 1)
+ if (box && box->cpu >= 0 && atomic_inc_return(&box->refcnt) == 1)
uncore_box_init(box);
}
}
@@ -1794,6 +1816,11 @@ static const struct intel_uncore_init_fun mtl_uncore_init __initconst = {
.mmio_init = adl_uncore_mmio_init,
};
+static const struct intel_uncore_init_fun lnl_uncore_init __initconst = {
+ .cpu_init = lnl_uncore_cpu_init,
+ .mmio_init = lnl_uncore_mmio_init,
+};
+
static const struct intel_uncore_init_fun icx_uncore_init __initconst = {
.cpu_init = icx_uncore_cpu_init,
.pci_init = icx_uncore_pci_init,
@@ -1829,56 +1856,60 @@ static const struct intel_uncore_init_fun generic_uncore_init __initconst = {
};
static const struct x86_cpu_id intel_uncore_match[] __initconst = {
- X86_MATCH_INTEL_FAM6_MODEL(NEHALEM_EP, &nhm_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(NEHALEM, &nhm_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(WESTMERE, &nhm_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(WESTMERE_EP, &nhm_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE, &snb_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE, &ivb_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(HASWELL, &hsw_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(HASWELL_L, &hsw_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(HASWELL_G, &hsw_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(BROADWELL, &bdw_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_G, &bdw_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE_X, &snbep_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(NEHALEM_EX, &nhmex_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(WESTMERE_EX, &nhmex_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE_X, &ivbep_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X, &hswep_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X, &bdx_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_D, &bdx_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL, &knl_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM, &knl_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE, &skl_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_L, &skl_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X, &skx_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE_L, &skl_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE, &skl_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE_L, &skl_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE, &skl_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, &icl_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_NNPI, &icl_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE, &icl_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, &icx_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &icx_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, &tgl_l_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, &tgl_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(ROCKETLAKE, &rkl_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, &adl_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, &adl_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, &adl_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, &adl_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_S, &adl_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE, &mtl_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE_L, &mtl_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &spr_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(EMERALDRAPIDS_X, &spr_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(GRANITERAPIDS_X, &gnr_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(GRANITERAPIDS_D, &gnr_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, &snr_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_GRACEMONT, &adl_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_CRESTMONT_X, &gnr_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_CRESTMONT, &gnr_uncore_init),
+ X86_MATCH_VFM(INTEL_NEHALEM_EP, &nhm_uncore_init),
+ X86_MATCH_VFM(INTEL_NEHALEM, &nhm_uncore_init),
+ X86_MATCH_VFM(INTEL_WESTMERE, &nhm_uncore_init),
+ X86_MATCH_VFM(INTEL_WESTMERE_EP, &nhm_uncore_init),
+ X86_MATCH_VFM(INTEL_SANDYBRIDGE, &snb_uncore_init),
+ X86_MATCH_VFM(INTEL_IVYBRIDGE, &ivb_uncore_init),
+ X86_MATCH_VFM(INTEL_HASWELL, &hsw_uncore_init),
+ X86_MATCH_VFM(INTEL_HASWELL_L, &hsw_uncore_init),
+ X86_MATCH_VFM(INTEL_HASWELL_G, &hsw_uncore_init),
+ X86_MATCH_VFM(INTEL_BROADWELL, &bdw_uncore_init),
+ X86_MATCH_VFM(INTEL_BROADWELL_G, &bdw_uncore_init),
+ X86_MATCH_VFM(INTEL_SANDYBRIDGE_X, &snbep_uncore_init),
+ X86_MATCH_VFM(INTEL_NEHALEM_EX, &nhmex_uncore_init),
+ X86_MATCH_VFM(INTEL_WESTMERE_EX, &nhmex_uncore_init),
+ X86_MATCH_VFM(INTEL_IVYBRIDGE_X, &ivbep_uncore_init),
+ X86_MATCH_VFM(INTEL_HASWELL_X, &hswep_uncore_init),
+ X86_MATCH_VFM(INTEL_BROADWELL_X, &bdx_uncore_init),
+ X86_MATCH_VFM(INTEL_BROADWELL_D, &bdx_uncore_init),
+ X86_MATCH_VFM(INTEL_XEON_PHI_KNL, &knl_uncore_init),
+ X86_MATCH_VFM(INTEL_XEON_PHI_KNM, &knl_uncore_init),
+ X86_MATCH_VFM(INTEL_SKYLAKE, &skl_uncore_init),
+ X86_MATCH_VFM(INTEL_SKYLAKE_L, &skl_uncore_init),
+ X86_MATCH_VFM(INTEL_SKYLAKE_X, &skx_uncore_init),
+ X86_MATCH_VFM(INTEL_KABYLAKE_L, &skl_uncore_init),
+ X86_MATCH_VFM(INTEL_KABYLAKE, &skl_uncore_init),
+ X86_MATCH_VFM(INTEL_COMETLAKE_L, &skl_uncore_init),
+ X86_MATCH_VFM(INTEL_COMETLAKE, &skl_uncore_init),
+ X86_MATCH_VFM(INTEL_ICELAKE_L, &icl_uncore_init),
+ X86_MATCH_VFM(INTEL_ICELAKE_NNPI, &icl_uncore_init),
+ X86_MATCH_VFM(INTEL_ICELAKE, &icl_uncore_init),
+ X86_MATCH_VFM(INTEL_ICELAKE_D, &icx_uncore_init),
+ X86_MATCH_VFM(INTEL_ICELAKE_X, &icx_uncore_init),
+ X86_MATCH_VFM(INTEL_TIGERLAKE_L, &tgl_l_uncore_init),
+ X86_MATCH_VFM(INTEL_TIGERLAKE, &tgl_uncore_init),
+ X86_MATCH_VFM(INTEL_ROCKETLAKE, &rkl_uncore_init),
+ X86_MATCH_VFM(INTEL_ALDERLAKE, &adl_uncore_init),
+ X86_MATCH_VFM(INTEL_ALDERLAKE_L, &adl_uncore_init),
+ X86_MATCH_VFM(INTEL_RAPTORLAKE, &adl_uncore_init),
+ X86_MATCH_VFM(INTEL_RAPTORLAKE_P, &adl_uncore_init),
+ X86_MATCH_VFM(INTEL_RAPTORLAKE_S, &adl_uncore_init),
+ X86_MATCH_VFM(INTEL_METEORLAKE, &mtl_uncore_init),
+ X86_MATCH_VFM(INTEL_METEORLAKE_L, &mtl_uncore_init),
+ X86_MATCH_VFM(INTEL_ARROWLAKE, &mtl_uncore_init),
+ X86_MATCH_VFM(INTEL_ARROWLAKE_U, &mtl_uncore_init),
+ X86_MATCH_VFM(INTEL_ARROWLAKE_H, &mtl_uncore_init),
+ X86_MATCH_VFM(INTEL_LUNARLAKE_M, &lnl_uncore_init),
+ X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, &spr_uncore_init),
+ X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, &spr_uncore_init),
+ X86_MATCH_VFM(INTEL_GRANITERAPIDS_X, &gnr_uncore_init),
+ X86_MATCH_VFM(INTEL_GRANITERAPIDS_D, &gnr_uncore_init),
+ X86_MATCH_VFM(INTEL_ATOM_TREMONT_D, &snr_uncore_init),
+ X86_MATCH_VFM(INTEL_ATOM_GRACEMONT, &adl_uncore_init),
+ X86_MATCH_VFM(INTEL_ATOM_CRESTMONT_X, &gnr_uncore_init),
+ X86_MATCH_VFM(INTEL_ATOM_CRESTMONT, &gnr_uncore_init),
{},
};
MODULE_DEVICE_TABLE(x86cpu, intel_uncore_match);
diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h
index 4838502d89ae..79ff32e13dcc 100644
--- a/arch/x86/events/intel/uncore.h
+++ b/arch/x86/events/intel/uncore.h
@@ -62,7 +62,6 @@ struct intel_uncore_type {
unsigned fixed_ctr;
unsigned fixed_ctl;
unsigned box_ctl;
- u64 *box_ctls; /* Unit ctrl addr of the first box of each die */
union {
unsigned msr_offset;
unsigned mmio_offset;
@@ -76,7 +75,6 @@ struct intel_uncore_type {
u64 *pci_offsets;
u64 *mmio_offsets;
};
- unsigned *box_ids;
struct event_constraint unconstrainted;
struct event_constraint *constraints;
struct intel_uncore_pmu *pmus;
@@ -86,6 +84,7 @@ struct intel_uncore_type {
const struct attribute_group *attr_groups[4];
const struct attribute_group **attr_update;
struct pmu *pmu; /* for custom pmu ops */
+ struct rb_root *boxes;
/*
* Uncore PMU would store relevant platform topology configuration here
* to identify which platform component each PMON block of that type is
@@ -98,6 +97,10 @@ struct intel_uncore_type {
int (*get_topology)(struct intel_uncore_type *type);
void (*set_mapping)(struct intel_uncore_type *type);
void (*cleanup_mapping)(struct intel_uncore_type *type);
+ /*
+ * Optional callbacks for extra uncore units cleanup
+ */
+ void (*cleanup_extra_boxes)(struct intel_uncore_type *type);
};
#define pmu_group attr_groups[0]
@@ -125,6 +128,7 @@ struct intel_uncore_pmu {
int func_id;
bool registered;
atomic_t activeboxes;
+ cpumask_t cpu_mask;
struct intel_uncore_type *type;
struct intel_uncore_box **boxes;
};
@@ -607,10 +611,12 @@ void skl_uncore_cpu_init(void);
void icl_uncore_cpu_init(void);
void tgl_uncore_cpu_init(void);
void adl_uncore_cpu_init(void);
+void lnl_uncore_cpu_init(void);
void mtl_uncore_cpu_init(void);
void tgl_uncore_mmio_init(void);
void tgl_l_uncore_mmio_init(void);
void adl_uncore_mmio_init(void);
+void lnl_uncore_mmio_init(void);
int snb_pci2phy_map_init(int devid);
/* uncore_snbep.c */
diff --git a/arch/x86/events/intel/uncore_discovery.c b/arch/x86/events/intel/uncore_discovery.c
index 9a698a92962a..571e44b49691 100644
--- a/arch/x86/events/intel/uncore_discovery.c
+++ b/arch/x86/events/intel/uncore_discovery.c
@@ -89,9 +89,7 @@ add_uncore_discovery_type(struct uncore_unit_discovery *unit)
if (!type)
return NULL;
- type->box_ctrl_die = kcalloc(__uncore_max_dies, sizeof(u64), GFP_KERNEL);
- if (!type->box_ctrl_die)
- goto free_type;
+ type->units = RB_ROOT;
type->access_type = unit->access_type;
num_discovered_types[type->access_type]++;
@@ -100,12 +98,6 @@ add_uncore_discovery_type(struct uncore_unit_discovery *unit)
rb_add(&type->node, &discovery_tables, __type_less);
return type;
-
-free_type:
- kfree(type);
-
- return NULL;
-
}
static struct intel_uncore_discovery_type *
@@ -120,14 +112,118 @@ get_uncore_discovery_type(struct uncore_unit_discovery *unit)
return add_uncore_discovery_type(unit);
}
+static inline int pmu_idx_cmp(const void *key, const struct rb_node *b)
+{
+ struct intel_uncore_discovery_unit *unit;
+ const unsigned int *id = key;
+
+ unit = rb_entry(b, struct intel_uncore_discovery_unit, node);
+
+ if (unit->pmu_idx > *id)
+ return -1;
+ else if (unit->pmu_idx < *id)
+ return 1;
+
+ return 0;
+}
+
+static struct intel_uncore_discovery_unit *
+intel_uncore_find_discovery_unit(struct rb_root *units, int die,
+ unsigned int pmu_idx)
+{
+ struct intel_uncore_discovery_unit *unit;
+ struct rb_node *pos;
+
+ if (!units)
+ return NULL;
+
+ pos = rb_find_first(&pmu_idx, units, pmu_idx_cmp);
+ if (!pos)
+ return NULL;
+ unit = rb_entry(pos, struct intel_uncore_discovery_unit, node);
+
+ if (die < 0)
+ return unit;
+
+ for (; pos; pos = rb_next(pos)) {
+ unit = rb_entry(pos, struct intel_uncore_discovery_unit, node);
+
+ if (unit->pmu_idx != pmu_idx)
+ break;
+
+ if (unit->die == die)
+ return unit;
+ }
+
+ return NULL;
+}
+
+int intel_uncore_find_discovery_unit_id(struct rb_root *units, int die,
+ unsigned int pmu_idx)
+{
+ struct intel_uncore_discovery_unit *unit;
+
+ unit = intel_uncore_find_discovery_unit(units, die, pmu_idx);
+ if (unit)
+ return unit->id;
+
+ return -1;
+}
+
+static inline bool unit_less(struct rb_node *a, const struct rb_node *b)
+{
+ struct intel_uncore_discovery_unit *a_node, *b_node;
+
+ a_node = rb_entry(a, struct intel_uncore_discovery_unit, node);
+ b_node = rb_entry(b, struct intel_uncore_discovery_unit, node);
+
+ if (a_node->pmu_idx < b_node->pmu_idx)
+ return true;
+ if (a_node->pmu_idx > b_node->pmu_idx)
+ return false;
+
+ if (a_node->die < b_node->die)
+ return true;
+ if (a_node->die > b_node->die)
+ return false;
+
+ return 0;
+}
+
+static inline struct intel_uncore_discovery_unit *
+uncore_find_unit(struct rb_root *root, unsigned int id)
+{
+ struct intel_uncore_discovery_unit *unit;
+ struct rb_node *node;
+
+ for (node = rb_first(root); node; node = rb_next(node)) {
+ unit = rb_entry(node, struct intel_uncore_discovery_unit, node);
+ if (unit->id == id)
+ return unit;
+ }
+
+ return NULL;
+}
+
+void uncore_find_add_unit(struct intel_uncore_discovery_unit *node,
+ struct rb_root *root, u16 *num_units)
+{
+ struct intel_uncore_discovery_unit *unit = uncore_find_unit(root, node->id);
+
+ if (unit)
+ node->pmu_idx = unit->pmu_idx;
+ else if (num_units)
+ node->pmu_idx = (*num_units)++;
+
+ rb_add(&node->node, root, unit_less);
+}
+
static void
uncore_insert_box_info(struct uncore_unit_discovery *unit,
- int die, bool parsed)
+ int die)
{
+ struct intel_uncore_discovery_unit *node;
struct intel_uncore_discovery_type *type;
- unsigned int *ids;
- u64 *box_offset;
- int i;
if (!unit->ctl || !unit->ctl_offset || !unit->ctr_offset) {
pr_info("Invalid address is detected for uncore type %d box %d, "
@@ -136,71 +232,29 @@ uncore_insert_box_info(struct uncore_unit_discovery *unit,
return;
}
- if (parsed) {
- type = search_uncore_discovery_type(unit->box_type);
- if (!type) {
- pr_info("A spurious uncore type %d is detected, "
- "Disable the uncore type.\n",
- unit->box_type);
- return;
- }
- /* Store the first box of each die */
- if (!type->box_ctrl_die[die])
- type->box_ctrl_die[die] = unit->ctl;
+ node = kzalloc(sizeof(*node), GFP_KERNEL);
+ if (!node)
return;
- }
- type = get_uncore_discovery_type(unit);
- if (!type)
- return;
+ node->die = die;
+ node->id = unit->box_id;
+ node->addr = unit->ctl;
- box_offset = kcalloc(type->num_boxes + 1, sizeof(u64), GFP_KERNEL);
- if (!box_offset)
+ type = get_uncore_discovery_type(unit);
+ if (!type) {
+ kfree(node);
return;
+ }
- ids = kcalloc(type->num_boxes + 1, sizeof(unsigned int), GFP_KERNEL);
- if (!ids)
- goto free_box_offset;
+ uncore_find_add_unit(node, &type->units, &type->num_units);
/* Store generic information for the first box */
- if (!type->num_boxes) {
- type->box_ctrl = unit->ctl;
- type->box_ctrl_die[die] = unit->ctl;
+ if (type->num_units == 1) {
type->num_counters = unit->num_regs;
type->counter_width = unit->bit_width;
type->ctl_offset = unit->ctl_offset;
type->ctr_offset = unit->ctr_offset;
- *ids = unit->box_id;
- goto end;
- }
-
- for (i = 0; i < type->num_boxes; i++) {
- ids[i] = type->ids[i];
- box_offset[i] = type->box_offset[i];
-
- if (unit->box_id == ids[i]) {
- pr_info("Duplicate uncore type %d box ID %d is detected, "
- "Drop the duplicate uncore unit.\n",
- unit->box_type, unit->box_id);
- goto free_ids;
- }
}
- ids[i] = unit->box_id;
- box_offset[i] = unit->ctl - type->box_ctrl;
- kfree(type->ids);
- kfree(type->box_offset);
-end:
- type->ids = ids;
- type->box_offset = box_offset;
- type->num_boxes++;
- return;
-
-free_ids:
- kfree(ids);
-
-free_box_offset:
- kfree(box_offset);
-
}
static bool
@@ -279,7 +333,7 @@ static int parse_discovery_table(struct pci_dev *dev, int die,
if (uncore_ignore_unit(&unit, ignore))
continue;
- uncore_insert_box_info(&unit, die, *parsed);
+ uncore_insert_box_info(&unit, die);
}
*parsed = true;
@@ -339,9 +393,16 @@ err:
void intel_uncore_clear_discovery_tables(void)
{
struct intel_uncore_discovery_type *type, *next;
+ struct intel_uncore_discovery_unit *pos;
+ struct rb_node *node;
rbtree_postorder_for_each_entry_safe(type, next, &discovery_tables, node) {
- kfree(type->box_ctrl_die);
+ while (!RB_EMPTY_ROOT(&type->units)) {
+ node = rb_first(&type->units);
+ pos = rb_entry(node, struct intel_uncore_discovery_unit, node);
+ rb_erase(node, &type->units);
+ kfree(pos);
+ }
kfree(type);
}
}
@@ -366,19 +427,31 @@ static const struct attribute_group generic_uncore_format_group = {
.attrs = generic_uncore_formats_attr,
};
+static u64 intel_generic_uncore_box_ctl(struct intel_uncore_box *box)
+{
+ struct intel_uncore_discovery_unit *unit;
+
+ unit = intel_uncore_find_discovery_unit(box->pmu->type->boxes,
+ -1, box->pmu->pmu_idx);
+ if (WARN_ON_ONCE(!unit))
+ return 0;
+
+ return unit->addr;
+}
+
void intel_generic_uncore_msr_init_box(struct intel_uncore_box *box)
{
- wrmsrl(uncore_msr_box_ctl(box), GENERIC_PMON_BOX_CTL_INT);
+ wrmsrl(intel_generic_uncore_box_ctl(box), GENERIC_PMON_BOX_CTL_INT);
}
void intel_generic_uncore_msr_disable_box(struct intel_uncore_box *box)
{
- wrmsrl(uncore_msr_box_ctl(box), GENERIC_PMON_BOX_CTL_FRZ);
+ wrmsrl(intel_generic_uncore_box_ctl(box), GENERIC_PMON_BOX_CTL_FRZ);
}
void intel_generic_uncore_msr_enable_box(struct intel_uncore_box *box)
{
- wrmsrl(uncore_msr_box_ctl(box), 0);
+ wrmsrl(intel_generic_uncore_box_ctl(box), 0);
}
static void intel_generic_uncore_msr_enable_event(struct intel_uncore_box *box,
@@ -406,10 +479,47 @@ static struct intel_uncore_ops generic_uncore_msr_ops = {
.read_counter = uncore_msr_read_counter,
};
+bool intel_generic_uncore_assign_hw_event(struct perf_event *event,
+ struct intel_uncore_box *box)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ u64 box_ctl;
+
+ if (!box->pmu->type->boxes)
+ return false;
+
+ if (box->io_addr) {
+ hwc->config_base = uncore_pci_event_ctl(box, hwc->idx);
+ hwc->event_base = uncore_pci_perf_ctr(box, hwc->idx);
+ return true;
+ }
+
+ box_ctl = intel_generic_uncore_box_ctl(box);
+ if (!box_ctl)
+ return false;
+
+ if (box->pci_dev) {
+ box_ctl = UNCORE_DISCOVERY_PCI_BOX_CTRL(box_ctl);
+ hwc->config_base = box_ctl + uncore_pci_event_ctl(box, hwc->idx);
+ hwc->event_base = box_ctl + uncore_pci_perf_ctr(box, hwc->idx);
+ return true;
+ }
+
+ hwc->config_base = box_ctl + box->pmu->type->event_ctl + hwc->idx;
+ hwc->event_base = box_ctl + box->pmu->type->perf_ctr + hwc->idx;
+
+ return true;
+}
+
+static inline int intel_pci_uncore_box_ctl(struct intel_uncore_box *box)
+{
+ return UNCORE_DISCOVERY_PCI_BOX_CTRL(intel_generic_uncore_box_ctl(box));
+}
+
void intel_generic_uncore_pci_init_box(struct intel_uncore_box *box)
{
struct pci_dev *pdev = box->pci_dev;
- int box_ctl = uncore_pci_box_ctl(box);
+ int box_ctl = intel_pci_uncore_box_ctl(box);
__set_bit(UNCORE_BOX_FLAG_CTL_OFFS8, &box->flags);
pci_write_config_dword(pdev, box_ctl, GENERIC_PMON_BOX_CTL_INT);
@@ -418,7 +528,7 @@ void intel_generic_uncore_pci_init_box(struct intel_uncore_box *box)
void intel_generic_uncore_pci_disable_box(struct intel_uncore_box *box)
{
struct pci_dev *pdev = box->pci_dev;
- int box_ctl = uncore_pci_box_ctl(box);
+ int box_ctl = intel_pci_uncore_box_ctl(box);
pci_write_config_dword(pdev, box_ctl, GENERIC_PMON_BOX_CTL_FRZ);
}
@@ -426,7 +536,7 @@ void intel_generic_uncore_pci_disable_box(struct intel_uncore_box *box)
void intel_generic_uncore_pci_enable_box(struct intel_uncore_box *box)
{
struct pci_dev *pdev = box->pci_dev;
- int box_ctl = uncore_pci_box_ctl(box);
+ int box_ctl = intel_pci_uncore_box_ctl(box);
pci_write_config_dword(pdev, box_ctl, 0);
}
@@ -473,34 +583,30 @@ static struct intel_uncore_ops generic_uncore_pci_ops = {
#define UNCORE_GENERIC_MMIO_SIZE 0x4000
-static u64 generic_uncore_mmio_box_ctl(struct intel_uncore_box *box)
-{
- struct intel_uncore_type *type = box->pmu->type;
-
- if (!type->box_ctls || !type->box_ctls[box->dieid] || !type->mmio_offsets)
- return 0;
-
- return type->box_ctls[box->dieid] + type->mmio_offsets[box->pmu->pmu_idx];
-}
-
void intel_generic_uncore_mmio_init_box(struct intel_uncore_box *box)
{
- u64 box_ctl = generic_uncore_mmio_box_ctl(box);
+ static struct intel_uncore_discovery_unit *unit;
struct intel_uncore_type *type = box->pmu->type;
resource_size_t addr;
- if (!box_ctl) {
+ unit = intel_uncore_find_discovery_unit(type->boxes, box->dieid, box->pmu->pmu_idx);
+ if (!unit) {
+ pr_warn("Uncore type %d id %d: Cannot find box control address.\n",
+ type->type_id, box->pmu->pmu_idx);
+ return;
+ }
+
+ if (!unit->addr) {
pr_warn("Uncore type %d box %d: Invalid box control address.\n",
- type->type_id, type->box_ids[box->pmu->pmu_idx]);
+ type->type_id, unit->id);
return;
}
- addr = box_ctl;
+ addr = unit->addr;
box->io_addr = ioremap(addr, UNCORE_GENERIC_MMIO_SIZE);
if (!box->io_addr) {
pr_warn("Uncore type %d box %d: ioremap error for 0x%llx.\n",
- type->type_id, type->box_ids[box->pmu->pmu_idx],
- (unsigned long long)addr);
+ type->type_id, unit->id, (unsigned long long)addr);
return;
}
@@ -560,34 +666,22 @@ static bool uncore_update_uncore_type(enum uncore_access_type type_id,
struct intel_uncore_discovery_type *type)
{
uncore->type_id = type->type;
- uncore->num_boxes = type->num_boxes;
uncore->num_counters = type->num_counters;
uncore->perf_ctr_bits = type->counter_width;
- uncore->box_ids = type->ids;
+ uncore->perf_ctr = (unsigned int)type->ctr_offset;
+ uncore->event_ctl = (unsigned int)type->ctl_offset;
+ uncore->boxes = &type->units;
+ uncore->num_boxes = type->num_units;
switch (type_id) {
case UNCORE_ACCESS_MSR:
uncore->ops = &generic_uncore_msr_ops;
- uncore->perf_ctr = (unsigned int)type->box_ctrl + type->ctr_offset;
- uncore->event_ctl = (unsigned int)type->box_ctrl + type->ctl_offset;
- uncore->box_ctl = (unsigned int)type->box_ctrl;
- uncore->msr_offsets = type->box_offset;
break;
case UNCORE_ACCESS_PCI:
uncore->ops = &generic_uncore_pci_ops;
- uncore->perf_ctr = (unsigned int)UNCORE_DISCOVERY_PCI_BOX_CTRL(type->box_ctrl) + type->ctr_offset;
- uncore->event_ctl = (unsigned int)UNCORE_DISCOVERY_PCI_BOX_CTRL(type->box_ctrl) + type->ctl_offset;
- uncore->box_ctl = (unsigned int)UNCORE_DISCOVERY_PCI_BOX_CTRL(type->box_ctrl);
- uncore->box_ctls = type->box_ctrl_die;
- uncore->pci_offsets = type->box_offset;
break;
case UNCORE_ACCESS_MMIO:
uncore->ops = &generic_uncore_mmio_ops;
- uncore->perf_ctr = (unsigned int)type->ctr_offset;
- uncore->event_ctl = (unsigned int)type->ctl_offset;
- uncore->box_ctl = (unsigned int)type->box_ctrl;
- uncore->box_ctls = type->box_ctrl_die;
- uncore->mmio_offsets = type->box_offset;
uncore->mmio_map_size = UNCORE_GENERIC_MMIO_SIZE;
break;
default:
diff --git a/arch/x86/events/intel/uncore_discovery.h b/arch/x86/events/intel/uncore_discovery.h
index 22e769a81103..0e94aa7db8e7 100644
--- a/arch/x86/events/intel/uncore_discovery.h
+++ b/arch/x86/events/intel/uncore_discovery.h
@@ -113,19 +113,24 @@ struct uncore_unit_discovery {
};
};
+struct intel_uncore_discovery_unit {
+ struct rb_node node;
+ unsigned int pmu_idx; /* The idx of the corresponding PMU */
+ unsigned int id; /* Unit ID */
+ unsigned int die; /* Die ID */
+ u64 addr; /* Unit Control Address */
+};
+
struct intel_uncore_discovery_type {
struct rb_node node;
enum uncore_access_type access_type;
- u64 box_ctrl; /* Unit ctrl addr of the first box */
- u64 *box_ctrl_die; /* Unit ctrl addr of the first box of each die */
+ struct rb_root units; /* Unit ctrl addr for all units */
u16 type; /* Type ID of the uncore block */
u8 num_counters;
u8 counter_width;
u8 ctl_offset; /* Counter Control 0 offset */
u8 ctr_offset; /* Counter 0 offset */
- u16 num_boxes; /* number of boxes for the uncore block */
- unsigned int *ids; /* Box IDs */
- u64 *box_offset; /* Box offset */
+ u16 num_units; /* number of units */
};
bool intel_uncore_has_discovery_tables(int *ignore);
@@ -156,3 +161,10 @@ u64 intel_generic_uncore_pci_read_counter(struct intel_uncore_box *box,
struct intel_uncore_type **
intel_uncore_generic_init_uncores(enum uncore_access_type type_id, int num_extra);
+
+int intel_uncore_find_discovery_unit_id(struct rb_root *units, int die,
+ unsigned int pmu_idx);
+bool intel_generic_uncore_assign_hw_event(struct perf_event *event,
+ struct intel_uncore_box *box);
+void uncore_find_add_unit(struct intel_uncore_discovery_unit *node,
+ struct rb_root *root, u16 *num_units);
diff --git a/arch/x86/events/intel/uncore_nhmex.c b/arch/x86/events/intel/uncore_nhmex.c
index 92da8aaa5966..466833478e81 100644
--- a/arch/x86/events/intel/uncore_nhmex.c
+++ b/arch/x86/events/intel/uncore_nhmex.c
@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/* Nehalem-EX/Westmere-EX uncore support */
+#include <asm/cpu_device_id.h>
#include "uncore.h"
/* NHM-EX event control */
@@ -1217,7 +1218,7 @@ static struct intel_uncore_type *nhmex_msr_uncores[] = {
void nhmex_uncore_cpu_init(void)
{
- if (boot_cpu_data.x86_model == 46)
+ if (boot_cpu_data.x86_vfm == INTEL_NEHALEM_EX)
uncore_nhmex = true;
else
nhmex_uncore_mbox.event_descs = wsmex_uncore_mbox_events;
diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c
index 9462fd9f3b7a..3934e1e4e3b1 100644
--- a/arch/x86/events/intel/uncore_snb.c
+++ b/arch/x86/events/intel/uncore_snb.c
@@ -252,6 +252,7 @@ DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23");
DEFINE_UNCORE_FORMAT_ATTR(cmask5, cmask, "config:24-28");
DEFINE_UNCORE_FORMAT_ATTR(cmask8, cmask, "config:24-31");
DEFINE_UNCORE_FORMAT_ATTR(threshold, threshold, "config:24-29");
+DEFINE_UNCORE_FORMAT_ATTR(threshold2, threshold, "config:24-31");
/* Sandy Bridge uncore support */
static void snb_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
@@ -746,6 +747,34 @@ void mtl_uncore_cpu_init(void)
uncore_msr_uncores = mtl_msr_uncores;
}
+static struct intel_uncore_type *lnl_msr_uncores[] = {
+ &mtl_uncore_cbox,
+ &mtl_uncore_arb,
+ NULL
+};
+
+#define LNL_UNC_MSR_GLOBAL_CTL 0x240e
+
+static void lnl_uncore_msr_init_box(struct intel_uncore_box *box)
+{
+ if (box->pmu->pmu_idx == 0)
+ wrmsrl(LNL_UNC_MSR_GLOBAL_CTL, SNB_UNC_GLOBAL_CTL_EN);
+}
+
+static struct intel_uncore_ops lnl_uncore_msr_ops = {
+ .init_box = lnl_uncore_msr_init_box,
+ .disable_event = snb_uncore_msr_disable_event,
+ .enable_event = snb_uncore_msr_enable_event,
+ .read_counter = uncore_msr_read_counter,
+};
+
+void lnl_uncore_cpu_init(void)
+{
+ mtl_uncore_cbox.num_boxes = 4;
+ mtl_uncore_cbox.ops = &lnl_uncore_msr_ops;
+ uncore_msr_uncores = lnl_msr_uncores;
+}
+
enum {
SNB_PCI_UNCORE_IMC,
};
@@ -1475,39 +1504,45 @@ static struct pci_dev *tgl_uncore_get_mc_dev(void)
ids++;
}
+ /* Just try to grab 00:00.0 device */
+ if (!mc_dev)
+ mc_dev = pci_get_domain_bus_and_slot(0, 0, PCI_DEVFN(0, 0));
+
return mc_dev;
}
#define TGL_UNCORE_MMIO_IMC_MEM_OFFSET 0x10000
#define TGL_UNCORE_PCI_IMC_MAP_SIZE 0xe000
-static void __uncore_imc_init_box(struct intel_uncore_box *box,
- unsigned int base_offset)
+static void
+uncore_get_box_mmio_addr(struct intel_uncore_box *box,
+ unsigned int base_offset,
+ int bar_offset, int step)
{
struct pci_dev *pdev = tgl_uncore_get_mc_dev();
struct intel_uncore_pmu *pmu = box->pmu;
struct intel_uncore_type *type = pmu->type;
resource_size_t addr;
- u32 mch_bar;
+ u32 bar;
if (!pdev) {
pr_warn("perf uncore: Cannot find matched IMC device.\n");
return;
}
- pci_read_config_dword(pdev, SNB_UNCORE_PCI_IMC_BAR_OFFSET, &mch_bar);
- /* MCHBAR is disabled */
- if (!(mch_bar & BIT(0))) {
- pr_warn("perf uncore: MCHBAR is disabled. Failed to map IMC free-running counters.\n");
+ pci_read_config_dword(pdev, bar_offset, &bar);
+ if (!(bar & BIT(0))) {
+ pr_warn("perf uncore: BAR 0x%x is disabled. Failed to map %s counters.\n",
+ bar_offset, type->name);
pci_dev_put(pdev);
return;
}
- mch_bar &= ~BIT(0);
- addr = (resource_size_t)(mch_bar + TGL_UNCORE_MMIO_IMC_MEM_OFFSET * pmu->pmu_idx);
+ bar &= ~BIT(0);
+ addr = (resource_size_t)(bar + step * pmu->pmu_idx);
#ifdef CONFIG_PHYS_ADDR_T_64BIT
- pci_read_config_dword(pdev, SNB_UNCORE_PCI_IMC_BAR_OFFSET + 4, &mch_bar);
- addr |= ((resource_size_t)mch_bar << 32);
+ pci_read_config_dword(pdev, bar_offset + 4, &bar);
+ addr |= ((resource_size_t)bar << 32);
#endif
addr += base_offset;
@@ -1518,6 +1553,14 @@ static void __uncore_imc_init_box(struct intel_uncore_box *box,
pci_dev_put(pdev);
}
+static void __uncore_imc_init_box(struct intel_uncore_box *box,
+ unsigned int base_offset)
+{
+ uncore_get_box_mmio_addr(box, base_offset,
+ SNB_UNCORE_PCI_IMC_BAR_OFFSET,
+ TGL_UNCORE_MMIO_IMC_MEM_OFFSET);
+}
+
static void tgl_uncore_imc_freerunning_init_box(struct intel_uncore_box *box)
{
__uncore_imc_init_box(box, 0);
@@ -1612,14 +1655,17 @@ static void adl_uncore_mmio_enable_box(struct intel_uncore_box *box)
writel(0, box->io_addr + uncore_mmio_box_ctl(box));
}
+#define MMIO_UNCORE_COMMON_OPS() \
+ .exit_box = uncore_mmio_exit_box, \
+ .disable_box = adl_uncore_mmio_disable_box, \
+ .enable_box = adl_uncore_mmio_enable_box, \
+ .disable_event = intel_generic_uncore_mmio_disable_event, \
+ .enable_event = intel_generic_uncore_mmio_enable_event, \
+ .read_counter = uncore_mmio_read_counter,
+
static struct intel_uncore_ops adl_uncore_mmio_ops = {
.init_box = adl_uncore_imc_init_box,
- .exit_box = uncore_mmio_exit_box,
- .disable_box = adl_uncore_mmio_disable_box,
- .enable_box = adl_uncore_mmio_enable_box,
- .disable_event = intel_generic_uncore_mmio_disable_event,
- .enable_event = intel_generic_uncore_mmio_enable_event,
- .read_counter = uncore_mmio_read_counter,
+ MMIO_UNCORE_COMMON_OPS()
};
#define ADL_UNC_CTL_CHMASK_MASK 0x00000f00
@@ -1703,3 +1749,108 @@ void adl_uncore_mmio_init(void)
}
/* end of Alder Lake MMIO uncore support */
+
+/* Lunar Lake MMIO uncore support */
+#define LNL_UNCORE_PCI_SAFBAR_OFFSET 0x68
+#define LNL_UNCORE_MAP_SIZE 0x1000
+#define LNL_UNCORE_SNCU_BASE 0xE4B000
+#define LNL_UNCORE_SNCU_CTR 0x390
+#define LNL_UNCORE_SNCU_CTRL 0x398
+#define LNL_UNCORE_SNCU_BOX_CTL 0x380
+#define LNL_UNCORE_GLOBAL_CTL 0x700
+#define LNL_UNCORE_HBO_BASE 0xE54000
+#define LNL_UNCORE_HBO_OFFSET -4096
+#define LNL_UNCORE_HBO_CTR 0x570
+#define LNL_UNCORE_HBO_CTRL 0x550
+#define LNL_UNCORE_HBO_BOX_CTL 0x548
+
+#define LNL_UNC_CTL_THRESHOLD 0xff000000
+#define LNL_UNC_RAW_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \
+ SNB_UNC_CTL_UMASK_MASK | \
+ SNB_UNC_CTL_EDGE_DET | \
+ SNB_UNC_CTL_INVERT | \
+ LNL_UNC_CTL_THRESHOLD)
+
+static struct attribute *lnl_uncore_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_threshold2.attr,
+ NULL
+};
+
+static const struct attribute_group lnl_uncore_format_group = {
+ .name = "format",
+ .attrs = lnl_uncore_formats_attr,
+};
+
+static void lnl_uncore_hbo_init_box(struct intel_uncore_box *box)
+{
+ uncore_get_box_mmio_addr(box, LNL_UNCORE_HBO_BASE,
+ LNL_UNCORE_PCI_SAFBAR_OFFSET,
+ LNL_UNCORE_HBO_OFFSET);
+}
+
+static struct intel_uncore_ops lnl_uncore_hbo_ops = {
+ .init_box = lnl_uncore_hbo_init_box,
+ MMIO_UNCORE_COMMON_OPS()
+};
+
+static struct intel_uncore_type lnl_uncore_hbo = {
+ .name = "hbo",
+ .num_counters = 4,
+ .num_boxes = 2,
+ .perf_ctr_bits = 64,
+ .perf_ctr = LNL_UNCORE_HBO_CTR,
+ .event_ctl = LNL_UNCORE_HBO_CTRL,
+ .event_mask = LNL_UNC_RAW_EVENT_MASK,
+ .box_ctl = LNL_UNCORE_HBO_BOX_CTL,
+ .mmio_map_size = LNL_UNCORE_MAP_SIZE,
+ .ops = &lnl_uncore_hbo_ops,
+ .format_group = &lnl_uncore_format_group,
+};
+
+static void lnl_uncore_sncu_init_box(struct intel_uncore_box *box)
+{
+ uncore_get_box_mmio_addr(box, LNL_UNCORE_SNCU_BASE,
+ LNL_UNCORE_PCI_SAFBAR_OFFSET,
+ 0);
+
+ if (box->io_addr)
+ writel(ADL_UNCORE_IMC_CTL_INT, box->io_addr + LNL_UNCORE_GLOBAL_CTL);
+}
+
+static struct intel_uncore_ops lnl_uncore_sncu_ops = {
+ .init_box = lnl_uncore_sncu_init_box,
+ MMIO_UNCORE_COMMON_OPS()
+};
+
+static struct intel_uncore_type lnl_uncore_sncu = {
+ .name = "sncu",
+ .num_counters = 2,
+ .num_boxes = 1,
+ .perf_ctr_bits = 64,
+ .perf_ctr = LNL_UNCORE_SNCU_CTR,
+ .event_ctl = LNL_UNCORE_SNCU_CTRL,
+ .event_mask = LNL_UNC_RAW_EVENT_MASK,
+ .box_ctl = LNL_UNCORE_SNCU_BOX_CTL,
+ .mmio_map_size = LNL_UNCORE_MAP_SIZE,
+ .ops = &lnl_uncore_sncu_ops,
+ .format_group = &lnl_uncore_format_group,
+};
+
+static struct intel_uncore_type *lnl_mmio_uncores[] = {
+ &adl_uncore_imc,
+ &lnl_uncore_hbo,
+ &lnl_uncore_sncu,
+ &adl_uncore_imc_free_running,
+ NULL
+};
+
+void lnl_uncore_mmio_init(void)
+{
+ uncore_mmio_uncores = lnl_mmio_uncores;
+}
+
+/* end of Lunar Lake MMIO uncore support */
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index 2eaf0f339849..ca98744343b8 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/* SandyBridge-EP/IvyTown uncore support */
+#include <asm/cpu_device_id.h>
#include "uncore.h"
#include "uncore_discovery.h"
@@ -461,6 +462,7 @@
#define SPR_UBOX_DID 0x3250
/* SPR CHA */
+#define SPR_CHA_EVENT_MASK_EXT 0xffffffff
#define SPR_CHA_PMON_CTL_TID_EN (1 << 16)
#define SPR_CHA_PMON_EVENT_MASK (SNBEP_PMON_RAW_EVENT_MASK | \
SPR_CHA_PMON_CTL_TID_EN)
@@ -477,6 +479,7 @@ DEFINE_UNCORE_FORMAT_ATTR(umask_ext, umask, "config:8-15,32-43,45-55");
DEFINE_UNCORE_FORMAT_ATTR(umask_ext2, umask, "config:8-15,32-57");
DEFINE_UNCORE_FORMAT_ATTR(umask_ext3, umask, "config:8-15,32-39");
DEFINE_UNCORE_FORMAT_ATTR(umask_ext4, umask, "config:8-15,32-55");
+DEFINE_UNCORE_FORMAT_ATTR(umask_ext5, umask, "config:8-15,32-63");
DEFINE_UNCORE_FORMAT_ATTR(qor, qor, "config:16");
DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18");
DEFINE_UNCORE_FORMAT_ATTR(tid_en, tid_en, "config:19");
@@ -3285,7 +3288,7 @@ void bdx_uncore_cpu_init(void)
uncore_msr_uncores = bdx_msr_uncores;
/* Detect systems with no SBOXes */
- if ((boot_cpu_data.x86_model == 86) || hswep_has_limit_sbox(BDX_PCU_DID))
+ if (boot_cpu_data.x86_vfm == INTEL_BROADWELL_D || hswep_has_limit_sbox(BDX_PCU_DID))
uncore_msr_uncores[BDX_MSR_UNCORE_SBOX] = NULL;
hswep_uncore_pcu.constraints = bdx_uncore_pcu_constraints;
@@ -5394,7 +5397,7 @@ static int icx_iio_get_topology(struct intel_uncore_type *type)
static void icx_iio_set_mapping(struct intel_uncore_type *type)
{
/* Detect ICX-D system. This case is not supported */
- if (boot_cpu_data.x86_model == INTEL_FAM6_ICELAKE_D) {
+ if (boot_cpu_data.x86_vfm == INTEL_ICELAKE_D) {
pmu_clear_mapping_attr(type->attr_update, &icx_iio_mapping_group);
return;
}
@@ -5932,10 +5935,11 @@ static int spr_cha_hw_config(struct intel_uncore_box *box, struct perf_event *ev
struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
bool tie_en = !!(event->hw.config & SPR_CHA_PMON_CTL_TID_EN);
struct intel_uncore_type *type = box->pmu->type;
+ int id = intel_uncore_find_discovery_unit_id(type->boxes, -1, box->pmu->pmu_idx);
if (tie_en) {
reg1->reg = SPR_C0_MSR_PMON_BOX_FILTER0 +
- HSWEP_CBO_MSR_OFFSET * type->box_ids[box->pmu->pmu_idx];
+ HSWEP_CBO_MSR_OFFSET * id;
reg1->config = event->attr.config1 & SPR_CHA_PMON_BOX_FILTER_TID;
reg1->idx = 0;
}
@@ -5957,7 +5961,7 @@ static struct intel_uncore_ops spr_uncore_chabox_ops = {
static struct attribute *spr_uncore_cha_formats_attr[] = {
&format_attr_event.attr,
- &format_attr_umask_ext4.attr,
+ &format_attr_umask_ext5.attr,
&format_attr_tid_en2.attr,
&format_attr_edge.attr,
&format_attr_inv.attr,
@@ -5993,7 +5997,7 @@ ATTRIBUTE_GROUPS(uncore_alias);
static struct intel_uncore_type spr_uncore_chabox = {
.name = "cha",
.event_mask = SPR_CHA_PMON_EVENT_MASK,
- .event_mask_ext = SPR_RAW_EVENT_MASK_EXT,
+ .event_mask_ext = SPR_CHA_EVENT_MASK_EXT,
.num_shared_regs = 1,
.constraints = skx_uncore_chabox_constraints,
.ops = &spr_uncore_chabox_ops,
@@ -6161,7 +6165,55 @@ static struct intel_uncore_type spr_uncore_mdf = {
.name = "mdf",
};
-#define UNCORE_SPR_NUM_UNCORE_TYPES 12
+static void spr_uncore_mmio_offs8_init_box(struct intel_uncore_box *box)
+{
+ __set_bit(UNCORE_BOX_FLAG_CTL_OFFS8, &box->flags);
+ intel_generic_uncore_mmio_init_box(box);
+}
+
+static struct intel_uncore_ops spr_uncore_mmio_offs8_ops = {
+ .init_box = spr_uncore_mmio_offs8_init_box,
+ .exit_box = uncore_mmio_exit_box,
+ .disable_box = intel_generic_uncore_mmio_disable_box,
+ .enable_box = intel_generic_uncore_mmio_enable_box,
+ .disable_event = intel_generic_uncore_mmio_disable_event,
+ .enable_event = spr_uncore_mmio_enable_event,
+ .read_counter = uncore_mmio_read_counter,
+};
+
+#define SPR_UNCORE_MMIO_OFFS8_COMMON_FORMAT() \
+ SPR_UNCORE_COMMON_FORMAT(), \
+ .ops = &spr_uncore_mmio_offs8_ops
+
+static struct event_constraint spr_uncore_cxlcm_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x02, 0x0f),
+ UNCORE_EVENT_CONSTRAINT(0x05, 0x0f),
+ UNCORE_EVENT_CONSTRAINT(0x40, 0xf0),
+ UNCORE_EVENT_CONSTRAINT(0x41, 0xf0),
+ UNCORE_EVENT_CONSTRAINT(0x42, 0xf0),
+ UNCORE_EVENT_CONSTRAINT(0x43, 0xf0),
+ UNCORE_EVENT_CONSTRAINT(0x4b, 0xf0),
+ UNCORE_EVENT_CONSTRAINT(0x52, 0xf0),
+ EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type spr_uncore_cxlcm = {
+ SPR_UNCORE_MMIO_OFFS8_COMMON_FORMAT(),
+ .name = "cxlcm",
+ .constraints = spr_uncore_cxlcm_constraints,
+};
+
+static struct intel_uncore_type spr_uncore_cxldp = {
+ SPR_UNCORE_MMIO_OFFS8_COMMON_FORMAT(),
+ .name = "cxldp",
+};
+
+static struct intel_uncore_type spr_uncore_hbm = {
+ SPR_UNCORE_COMMON_FORMAT(),
+ .name = "hbm",
+};
+
+#define UNCORE_SPR_NUM_UNCORE_TYPES 15
#define UNCORE_SPR_CHA 0
#define UNCORE_SPR_IIO 1
#define UNCORE_SPR_IMC 6
@@ -6185,6 +6237,9 @@ static struct intel_uncore_type *spr_uncores[UNCORE_SPR_NUM_UNCORE_TYPES] = {
NULL,
NULL,
&spr_uncore_mdf,
+ &spr_uncore_cxlcm,
+ &spr_uncore_cxldp,
+ &spr_uncore_hbm,
};
/*
@@ -6197,6 +6252,24 @@ static u64 spr_upi_pci_offsets[SPR_UNCORE_UPI_NUM_BOXES] = {
0, 0x8000, 0x10000, 0x18000
};
+static void spr_extra_boxes_cleanup(struct intel_uncore_type *type)
+{
+ struct intel_uncore_discovery_unit *pos;
+ struct rb_node *node;
+
+ if (!type->boxes)
+ return;
+
+ while (!RB_EMPTY_ROOT(type->boxes)) {
+ node = rb_first(type->boxes);
+ pos = rb_entry(node, struct intel_uncore_discovery_unit, node);
+ rb_erase(node, type->boxes);
+ kfree(pos);
+ }
+ kfree(type->boxes);
+ type->boxes = NULL;
+}
+
static struct intel_uncore_type spr_uncore_upi = {
.event_mask = SNBEP_PMON_RAW_EVENT_MASK,
.event_mask_ext = SPR_RAW_EVENT_MASK_EXT,
@@ -6211,10 +6284,11 @@ static struct intel_uncore_type spr_uncore_upi = {
.num_counters = 4,
.num_boxes = SPR_UNCORE_UPI_NUM_BOXES,
.perf_ctr_bits = 48,
- .perf_ctr = ICX_UPI_PCI_PMON_CTR0,
- .event_ctl = ICX_UPI_PCI_PMON_CTL0,
+ .perf_ctr = ICX_UPI_PCI_PMON_CTR0 - ICX_UPI_PCI_PMON_BOX_CTL,
+ .event_ctl = ICX_UPI_PCI_PMON_CTL0 - ICX_UPI_PCI_PMON_BOX_CTL,
.box_ctl = ICX_UPI_PCI_PMON_BOX_CTL,
.pci_offsets = spr_upi_pci_offsets,
+ .cleanup_extra_boxes = spr_extra_boxes_cleanup,
};
static struct intel_uncore_type spr_uncore_m3upi = {
@@ -6224,11 +6298,12 @@ static struct intel_uncore_type spr_uncore_m3upi = {
.num_counters = 4,
.num_boxes = SPR_UNCORE_UPI_NUM_BOXES,
.perf_ctr_bits = 48,
- .perf_ctr = ICX_M3UPI_PCI_PMON_CTR0,
- .event_ctl = ICX_M3UPI_PCI_PMON_CTL0,
+ .perf_ctr = ICX_M3UPI_PCI_PMON_CTR0 - ICX_M3UPI_PCI_PMON_BOX_CTL,
+ .event_ctl = ICX_M3UPI_PCI_PMON_CTL0 - ICX_M3UPI_PCI_PMON_BOX_CTL,
.box_ctl = ICX_M3UPI_PCI_PMON_BOX_CTL,
.pci_offsets = spr_upi_pci_offsets,
.constraints = icx_uncore_m3upi_constraints,
+ .cleanup_extra_boxes = spr_extra_boxes_cleanup,
};
enum perf_uncore_spr_iio_freerunning_type_id {
@@ -6459,18 +6534,21 @@ uncore_find_type_by_id(struct intel_uncore_type **types, int type_id)
static int uncore_type_max_boxes(struct intel_uncore_type **types,
int type_id)
{
+ struct intel_uncore_discovery_unit *unit;
struct intel_uncore_type *type;
- int i, max = 0;
+ struct rb_node *node;
+ int max = 0;
type = uncore_find_type_by_id(types, type_id);
if (!type)
return 0;
- for (i = 0; i < type->num_boxes; i++) {
- if (type->box_ids[i] > max)
- max = type->box_ids[i];
- }
+ for (node = rb_first(type->boxes); node; node = rb_next(node)) {
+ unit = rb_entry(node, struct intel_uncore_discovery_unit, node);
+ if (unit->id > max)
+ max = unit->id;
+ }
return max + 1;
}
@@ -6512,10 +6590,11 @@ void spr_uncore_cpu_init(void)
static void spr_update_device_location(int type_id)
{
+ struct intel_uncore_discovery_unit *unit;
struct intel_uncore_type *type;
struct pci_dev *dev = NULL;
+ struct rb_root *root;
u32 device, devfn;
- u64 *ctls;
int die;
if (type_id == UNCORE_SPR_UPI) {
@@ -6529,27 +6608,35 @@ static void spr_update_device_location(int type_id)
} else
return;
- ctls = kcalloc(__uncore_max_dies, sizeof(u64), GFP_KERNEL);
- if (!ctls) {
+ root = kzalloc(sizeof(struct rb_root), GFP_KERNEL);
+ if (!root) {
type->num_boxes = 0;
return;
}
+ *root = RB_ROOT;
while ((dev = pci_get_device(PCI_VENDOR_ID_INTEL, device, dev)) != NULL) {
- if (devfn != dev->devfn)
- continue;
die = uncore_device_to_die(dev);
if (die < 0)
continue;
- ctls[die] = pci_domain_nr(dev->bus) << UNCORE_DISCOVERY_PCI_DOMAIN_OFFSET |
- dev->bus->number << UNCORE_DISCOVERY_PCI_BUS_OFFSET |
- devfn << UNCORE_DISCOVERY_PCI_DEVFN_OFFSET |
- type->box_ctl;
+ unit = kzalloc(sizeof(*unit), GFP_KERNEL);
+ if (!unit)
+ continue;
+ unit->die = die;
+ unit->id = PCI_SLOT(dev->devfn) - PCI_SLOT(devfn);
+ unit->addr = pci_domain_nr(dev->bus) << UNCORE_DISCOVERY_PCI_DOMAIN_OFFSET |
+ dev->bus->number << UNCORE_DISCOVERY_PCI_BUS_OFFSET |
+ devfn << UNCORE_DISCOVERY_PCI_DEVFN_OFFSET |
+ type->box_ctl;
+
+ unit->pmu_idx = unit->id;
+
+ uncore_find_add_unit(unit, root, NULL);
}
- type->box_ctls = ctls;
+ type->boxes = root;
}
int spr_uncore_pci_init(void)
@@ -6622,7 +6709,7 @@ static struct intel_uncore_type gnr_uncore_b2cmi = {
};
static struct intel_uncore_type gnr_uncore_b2cxl = {
- SPR_UNCORE_MMIO_COMMON_FORMAT(),
+ SPR_UNCORE_MMIO_OFFS8_COMMON_FORMAT(),
.name = "b2cxl",
};
diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c
index 9e237b30f017..45b1866ff051 100644
--- a/arch/x86/events/msr.c
+++ b/arch/x86/events/msr.c
@@ -2,7 +2,7 @@
#include <linux/perf_event.h>
#include <linux/sysfs.h>
#include <linux/nospec.h>
-#include <asm/intel-family.h>
+#include <asm/cpu_device_id.h>
#include "probe.h"
enum perf_msr_id {
@@ -43,75 +43,75 @@ static bool test_intel(int idx, void *data)
boot_cpu_data.x86 != 6)
return false;
- switch (boot_cpu_data.x86_model) {
- case INTEL_FAM6_NEHALEM:
- case INTEL_FAM6_NEHALEM_G:
- case INTEL_FAM6_NEHALEM_EP:
- case INTEL_FAM6_NEHALEM_EX:
-
- case INTEL_FAM6_WESTMERE:
- case INTEL_FAM6_WESTMERE_EP:
- case INTEL_FAM6_WESTMERE_EX:
-
- case INTEL_FAM6_SANDYBRIDGE:
- case INTEL_FAM6_SANDYBRIDGE_X:
-
- case INTEL_FAM6_IVYBRIDGE:
- case INTEL_FAM6_IVYBRIDGE_X:
-
- case INTEL_FAM6_HASWELL:
- case INTEL_FAM6_HASWELL_X:
- case INTEL_FAM6_HASWELL_L:
- case INTEL_FAM6_HASWELL_G:
-
- case INTEL_FAM6_BROADWELL:
- case INTEL_FAM6_BROADWELL_D:
- case INTEL_FAM6_BROADWELL_G:
- case INTEL_FAM6_BROADWELL_X:
- case INTEL_FAM6_SAPPHIRERAPIDS_X:
- case INTEL_FAM6_EMERALDRAPIDS_X:
- case INTEL_FAM6_GRANITERAPIDS_X:
- case INTEL_FAM6_GRANITERAPIDS_D:
-
- case INTEL_FAM6_ATOM_SILVERMONT:
- case INTEL_FAM6_ATOM_SILVERMONT_D:
- case INTEL_FAM6_ATOM_AIRMONT:
-
- case INTEL_FAM6_ATOM_GOLDMONT:
- case INTEL_FAM6_ATOM_GOLDMONT_D:
- case INTEL_FAM6_ATOM_GOLDMONT_PLUS:
- case INTEL_FAM6_ATOM_TREMONT_D:
- case INTEL_FAM6_ATOM_TREMONT:
- case INTEL_FAM6_ATOM_TREMONT_L:
-
- case INTEL_FAM6_XEON_PHI_KNL:
- case INTEL_FAM6_XEON_PHI_KNM:
+ switch (boot_cpu_data.x86_vfm) {
+ case INTEL_NEHALEM:
+ case INTEL_NEHALEM_G:
+ case INTEL_NEHALEM_EP:
+ case INTEL_NEHALEM_EX:
+
+ case INTEL_WESTMERE:
+ case INTEL_WESTMERE_EP:
+ case INTEL_WESTMERE_EX:
+
+ case INTEL_SANDYBRIDGE:
+ case INTEL_SANDYBRIDGE_X:
+
+ case INTEL_IVYBRIDGE:
+ case INTEL_IVYBRIDGE_X:
+
+ case INTEL_HASWELL:
+ case INTEL_HASWELL_X:
+ case INTEL_HASWELL_L:
+ case INTEL_HASWELL_G:
+
+ case INTEL_BROADWELL:
+ case INTEL_BROADWELL_D:
+ case INTEL_BROADWELL_G:
+ case INTEL_BROADWELL_X:
+ case INTEL_SAPPHIRERAPIDS_X:
+ case INTEL_EMERALDRAPIDS_X:
+ case INTEL_GRANITERAPIDS_X:
+ case INTEL_GRANITERAPIDS_D:
+
+ case INTEL_ATOM_SILVERMONT:
+ case INTEL_ATOM_SILVERMONT_D:
+ case INTEL_ATOM_AIRMONT:
+
+ case INTEL_ATOM_GOLDMONT:
+ case INTEL_ATOM_GOLDMONT_D:
+ case INTEL_ATOM_GOLDMONT_PLUS:
+ case INTEL_ATOM_TREMONT_D:
+ case INTEL_ATOM_TREMONT:
+ case INTEL_ATOM_TREMONT_L:
+
+ case INTEL_XEON_PHI_KNL:
+ case INTEL_XEON_PHI_KNM:
if (idx == PERF_MSR_SMI)
return true;
break;
- case INTEL_FAM6_SKYLAKE_L:
- case INTEL_FAM6_SKYLAKE:
- case INTEL_FAM6_SKYLAKE_X:
- case INTEL_FAM6_KABYLAKE_L:
- case INTEL_FAM6_KABYLAKE:
- case INTEL_FAM6_COMETLAKE_L:
- case INTEL_FAM6_COMETLAKE:
- case INTEL_FAM6_ICELAKE_L:
- case INTEL_FAM6_ICELAKE:
- case INTEL_FAM6_ICELAKE_X:
- case INTEL_FAM6_ICELAKE_D:
- case INTEL_FAM6_TIGERLAKE_L:
- case INTEL_FAM6_TIGERLAKE:
- case INTEL_FAM6_ROCKETLAKE:
- case INTEL_FAM6_ALDERLAKE:
- case INTEL_FAM6_ALDERLAKE_L:
- case INTEL_FAM6_ATOM_GRACEMONT:
- case INTEL_FAM6_RAPTORLAKE:
- case INTEL_FAM6_RAPTORLAKE_P:
- case INTEL_FAM6_RAPTORLAKE_S:
- case INTEL_FAM6_METEORLAKE:
- case INTEL_FAM6_METEORLAKE_L:
+ case INTEL_SKYLAKE_L:
+ case INTEL_SKYLAKE:
+ case INTEL_SKYLAKE_X:
+ case INTEL_KABYLAKE_L:
+ case INTEL_KABYLAKE:
+ case INTEL_COMETLAKE_L:
+ case INTEL_COMETLAKE:
+ case INTEL_ICELAKE_L:
+ case INTEL_ICELAKE:
+ case INTEL_ICELAKE_X:
+ case INTEL_ICELAKE_D:
+ case INTEL_TIGERLAKE_L:
+ case INTEL_TIGERLAKE:
+ case INTEL_ROCKETLAKE:
+ case INTEL_ALDERLAKE:
+ case INTEL_ALDERLAKE_L:
+ case INTEL_ATOM_GRACEMONT:
+ case INTEL_RAPTORLAKE:
+ case INTEL_RAPTORLAKE_P:
+ case INTEL_RAPTORLAKE_S:
+ case INTEL_METEORLAKE:
+ case INTEL_METEORLAKE_L:
if (idx == PERF_MSR_SMI || idx == PERF_MSR_PPERF)
return true;
break;
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index fb56518356ec..ac1182141bf6 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -476,6 +476,14 @@ struct cpu_hw_events {
__EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LAT_HYBRID)
+#define INTEL_HYBRID_LDLAT_CONSTRAINT(c, n) \
+ __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
+ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LAT_HYBRID|PERF_X86_EVENT_PEBS_LD_HSW)
+
+#define INTEL_HYBRID_STLAT_CONSTRAINT(c, n) \
+ __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
+ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LAT_HYBRID|PERF_X86_EVENT_PEBS_ST_HSW)
+
/* Event constraint, but match on all event flags too. */
#define INTEL_FLAGS_EVENT_CONSTRAINT(c, n) \
EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS)
@@ -655,8 +663,10 @@ enum {
x86_lbr_exclusive_max,
};
-#define PERF_PEBS_DATA_SOURCE_MAX 0x10
+#define PERF_PEBS_DATA_SOURCE_MAX 0x100
#define PERF_PEBS_DATA_SOURCE_MASK (PERF_PEBS_DATA_SOURCE_MAX - 1)
+#define PERF_PEBS_DATA_SOURCE_GRT_MAX 0x10
+#define PERF_PEBS_DATA_SOURCE_GRT_MASK (PERF_PEBS_DATA_SOURCE_GRT_MAX - 1)
enum hybrid_cpu_type {
HYBRID_INTEL_NONE,
@@ -684,9 +694,16 @@ struct x86_hybrid_pmu {
cpumask_t supported_cpus;
union perf_capabilities intel_cap;
u64 intel_ctrl;
- int max_pebs_events;
- int num_counters;
- int num_counters_fixed;
+ u64 pebs_events_mask;
+ u64 config_mask;
+ union {
+ u64 cntr_mask64;
+ unsigned long cntr_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+ };
+ union {
+ u64 fixed_cntr_mask64;
+ unsigned long fixed_cntr_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+ };
struct event_constraint unconstrained;
u64 hw_cache_event_ids
@@ -770,12 +787,20 @@ struct x86_pmu {
int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign);
unsigned eventsel;
unsigned perfctr;
+ unsigned fixedctr;
int (*addr_offset)(int index, bool eventsel);
int (*rdpmc_index)(int index);
u64 (*event_map)(int);
int max_events;
- int num_counters;
- int num_counters_fixed;
+ u64 config_mask;
+ union {
+ u64 cntr_mask64;
+ unsigned long cntr_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+ };
+ union {
+ u64 fixed_cntr_mask64;
+ unsigned long fixed_cntr_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+ };
int cntval_bits;
u64 cntval_mask;
union {
@@ -852,7 +877,7 @@ struct x86_pmu {
pebs_ept :1;
int pebs_record_size;
int pebs_buffer_size;
- int max_pebs_events;
+ u64 pebs_events_mask;
void (*drain_pebs)(struct pt_regs *regs, struct perf_sample_data *data);
struct event_constraint *pebs_constraints;
void (*pebs_aliases)(struct perf_event *event);
@@ -1120,13 +1145,19 @@ static inline unsigned int x86_pmu_event_addr(int index)
x86_pmu.addr_offset(index, false) : index);
}
+static inline unsigned int x86_pmu_fixed_ctr_addr(int index)
+{
+ return x86_pmu.fixedctr + (x86_pmu.addr_offset ?
+ x86_pmu.addr_offset(index, false) : index);
+}
+
static inline int x86_pmu_rdpmc_index(int index)
{
return x86_pmu.rdpmc_index ? x86_pmu.rdpmc_index(index) : index;
}
-bool check_hw_exists(struct pmu *pmu, int num_counters,
- int num_counters_fixed);
+bool check_hw_exists(struct pmu *pmu, unsigned long *cntr_mask,
+ unsigned long *fixed_cntr_mask);
int x86_add_exclusive(unsigned int what);
@@ -1197,8 +1228,32 @@ void x86_pmu_enable_event(struct perf_event *event);
int x86_pmu_handle_irq(struct pt_regs *regs);
-void x86_pmu_show_pmu_cap(int num_counters, int num_counters_fixed,
- u64 intel_ctrl);
+void x86_pmu_show_pmu_cap(struct pmu *pmu);
+
+static inline int x86_pmu_num_counters(struct pmu *pmu)
+{
+ return hweight64(hybrid(pmu, cntr_mask64));
+}
+
+static inline int x86_pmu_max_num_counters(struct pmu *pmu)
+{
+ return fls64(hybrid(pmu, cntr_mask64));
+}
+
+static inline int x86_pmu_num_counters_fixed(struct pmu *pmu)
+{
+ return hweight64(hybrid(pmu, fixed_cntr_mask64));
+}
+
+static inline int x86_pmu_max_num_counters_fixed(struct pmu *pmu)
+{
+ return fls64(hybrid(pmu, fixed_cntr_mask64));
+}
+
+static inline u64 x86_pmu_get_event_config(struct perf_event *event)
+{
+ return event->attr.config & hybrid(event->pmu, config_mask);
+}
extern struct event_constraint emptyconstraint;
@@ -1329,6 +1384,19 @@ void amd_pmu_lbr_enable_all(void);
void amd_pmu_lbr_disable_all(void);
int amd_pmu_lbr_hw_config(struct perf_event *event);
+static __always_inline void __amd_pmu_lbr_disable(void)
+{
+ u64 dbg_ctl, dbg_extn_cfg;
+
+ rdmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg);
+ wrmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg & ~DBG_EXTN_CFG_LBRV2EN);
+
+ if (cpu_feature_enabled(X86_FEATURE_AMD_LBR_PMC_FREEZE)) {
+ rdmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl);
+ wrmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl & ~DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
+ }
+}
+
#ifdef CONFIG_PERF_EVENTS_AMD_BRS
#define AMD_FAM19H_BRS_EVENT 0xc4 /* RETIRED_TAKEN_BRANCH_INSTRUCTIONS */
@@ -1504,9 +1572,11 @@ void intel_pmu_disable_bts(void);
int intel_pmu_drain_bts_buffer(void);
-u64 adl_latency_data_small(struct perf_event *event, u64 status);
+u64 grt_latency_data(struct perf_event *event, u64 status);
+
+u64 cmt_latency_data(struct perf_event *event, u64 status);
-u64 mtl_latency_data_small(struct perf_event *event, u64 status);
+u64 lnl_latency_data(struct perf_event *event, u64 status);
extern struct event_constraint intel_core2_pebs_event_constraints[];
@@ -1538,6 +1608,8 @@ extern struct event_constraint intel_icl_pebs_event_constraints[];
extern struct event_constraint intel_glc_pebs_event_constraints[];
+extern struct event_constraint intel_lnc_pebs_event_constraints[];
+
struct event_constraint *intel_pebs_constraints(struct perf_event *event);
void intel_pmu_pebs_add(struct perf_event *event);
@@ -1627,6 +1699,8 @@ void intel_pmu_pebs_data_source_mtl(void);
void intel_pmu_pebs_data_source_cmt(void);
+void intel_pmu_pebs_data_source_lnl(void);
+
int intel_pmu_setup_lbr_filter(struct perf_event *event);
void intel_pt_interrupt(void);
@@ -1648,6 +1722,17 @@ static inline int is_ht_workaround_enabled(void)
return !!(x86_pmu.flags & PMU_FL_EXCL_ENABLED);
}
+static inline u64 intel_pmu_pebs_mask(u64 cntr_mask)
+{
+ return MAX_PEBS_EVENTS_MASK & cntr_mask;
+}
+
+static inline int intel_pmu_max_num_pebs(struct pmu *pmu)
+{
+ static_assert(MAX_PEBS_EVENTS == 32);
+ return fls((u32)hybrid(pmu, pebs_events_mask));
+}
+
#else /* CONFIG_CPU_SUP_INTEL */
static inline void reserve_ds_buffers(void)
diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c
index fb2b1961e5a3..a481a939862e 100644
--- a/arch/x86/events/rapl.c
+++ b/arch/x86/events/rapl.c
@@ -64,6 +64,7 @@
#include "perf_event.h"
#include "probe.h"
+MODULE_DESCRIPTION("Support Intel/AMD RAPL energy consumption counters");
MODULE_LICENSE("GPL");
/*
@@ -102,6 +103,19 @@ static struct perf_pmu_events_attr event_attr_##v = { \
.event_str = str, \
};
+/*
+ * RAPL Package energy counter scope:
+ * 1. AMD/HYGON platforms have a per-PKG package energy counter
+ * 2. For Intel platforms
+ * 2.1. CLX-AP is multi-die and its RAPL MSRs are die-scope
+ * 2.2. Other Intel platforms are single die systems so the scope can be
+ * considered as either pkg-scope or die-scope, and we are considering
+ * them as die-scope.
+ */
+#define rapl_pmu_is_pkg_scope() \
+ (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || \
+ boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)
+
struct rapl_pmu {
raw_spinlock_t lock;
int n_active;
@@ -114,8 +128,8 @@ struct rapl_pmu {
struct rapl_pmus {
struct pmu pmu;
- unsigned int maxdie;
- struct rapl_pmu *pmus[] __counted_by(maxdie);
+ unsigned int nr_rapl_pmu;
+ struct rapl_pmu *pmus[] __counted_by(nr_rapl_pmu);
};
enum rapl_unit_quirk {
@@ -139,15 +153,31 @@ static unsigned int rapl_cntr_mask;
static u64 rapl_timer_ms;
static struct perf_msr *rapl_msrs;
+/*
+ * Helper functions to get the correct topology macros according to the
+ * RAPL PMU scope.
+ */
+static inline unsigned int get_rapl_pmu_idx(int cpu)
+{
+ return rapl_pmu_is_pkg_scope() ? topology_logical_package_id(cpu) :
+ topology_logical_die_id(cpu);
+}
+
+static inline const struct cpumask *get_rapl_pmu_cpumask(int cpu)
+{
+ return rapl_pmu_is_pkg_scope() ? topology_core_cpumask(cpu) :
+ topology_die_cpumask(cpu);
+}
+
static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu)
{
- unsigned int dieid = topology_logical_die_id(cpu);
+ unsigned int rapl_pmu_idx = get_rapl_pmu_idx(cpu);
/*
* The unsigned check also catches the '-1' return value for non
* existent mappings in the topology map.
*/
- return dieid < rapl_pmus->maxdie ? rapl_pmus->pmus[dieid] : NULL;
+ return rapl_pmu_idx < rapl_pmus->nr_rapl_pmu ? rapl_pmus->pmus[rapl_pmu_idx] : NULL;
}
static inline u64 rapl_read_counter(struct perf_event *event)
@@ -551,7 +581,7 @@ static int rapl_cpu_offline(unsigned int cpu)
pmu->cpu = -1;
/* Find a new cpu to collect rapl events */
- target = cpumask_any_but(topology_die_cpumask(cpu), cpu);
+ target = cpumask_any_but(get_rapl_pmu_cpumask(cpu), cpu);
/* Migrate rapl events to the new target */
if (target < nr_cpu_ids) {
@@ -564,6 +594,11 @@ static int rapl_cpu_offline(unsigned int cpu)
static int rapl_cpu_online(unsigned int cpu)
{
+ s32 rapl_pmu_idx = get_rapl_pmu_idx(cpu);
+ if (rapl_pmu_idx < 0) {
+ pr_err("topology_logical_(package/die)_id() returned a negative value");
+ return -EINVAL;
+ }
struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
int target;
@@ -578,14 +613,14 @@ static int rapl_cpu_online(unsigned int cpu)
pmu->timer_interval = ms_to_ktime(rapl_timer_ms);
rapl_hrtimer_init(pmu);
- rapl_pmus->pmus[topology_logical_die_id(cpu)] = pmu;
+ rapl_pmus->pmus[rapl_pmu_idx] = pmu;
}
/*
* Check if there is an online cpu in the package which collects rapl
* events already.
*/
- target = cpumask_any_and(&rapl_cpu_mask, topology_die_cpumask(cpu));
+ target = cpumask_any_and(&rapl_cpu_mask, get_rapl_pmu_cpumask(cpu));
if (target < nr_cpu_ids)
return 0;
@@ -658,7 +693,7 @@ static void cleanup_rapl_pmus(void)
{
int i;
- for (i = 0; i < rapl_pmus->maxdie; i++)
+ for (i = 0; i < rapl_pmus->nr_rapl_pmu; i++)
kfree(rapl_pmus->pmus[i]);
kfree(rapl_pmus);
}
@@ -674,15 +709,16 @@ static const struct attribute_group *rapl_attr_update[] = {
static int __init init_rapl_pmus(void)
{
- int maxdie = topology_max_packages() * topology_max_dies_per_package();
- size_t size;
+ int nr_rapl_pmu = topology_max_packages();
+
+ if (!rapl_pmu_is_pkg_scope())
+ nr_rapl_pmu *= topology_max_dies_per_package();
- size = sizeof(*rapl_pmus) + maxdie * sizeof(struct rapl_pmu *);
- rapl_pmus = kzalloc(size, GFP_KERNEL);
+ rapl_pmus = kzalloc(struct_size(rapl_pmus, pmus, nr_rapl_pmu), GFP_KERNEL);
if (!rapl_pmus)
return -ENOMEM;
- rapl_pmus->maxdie = maxdie;
+ rapl_pmus->nr_rapl_pmu = nr_rapl_pmu;
rapl_pmus->pmu.attr_groups = rapl_attr_groups;
rapl_pmus->pmu.attr_update = rapl_attr_update;
rapl_pmus->pmu.task_ctx_nr = perf_invalid_context;
@@ -766,48 +802,51 @@ static struct rapl_model model_amd_hygon = {
};
static const struct x86_cpu_id rapl_model_match[] __initconst = {
- X86_MATCH_FEATURE(X86_FEATURE_RAPL, &model_amd_hygon),
- X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE, &model_snb),
- X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE_X, &model_snbep),
- X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE, &model_snb),
- X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE_X, &model_snbep),
- X86_MATCH_INTEL_FAM6_MODEL(HASWELL, &model_hsw),
- X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X, &model_hsx),
- X86_MATCH_INTEL_FAM6_MODEL(HASWELL_L, &model_hsw),
- X86_MATCH_INTEL_FAM6_MODEL(HASWELL_G, &model_hsw),
- X86_MATCH_INTEL_FAM6_MODEL(BROADWELL, &model_hsw),
- X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_G, &model_hsw),
- X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X, &model_hsx),
- X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_D, &model_hsx),
- X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL, &model_knl),
- X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM, &model_knl),
- X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_L, &model_skl),
- X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE, &model_skl),
- X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X, &model_hsx),
- X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE_L, &model_skl),
- X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE, &model_skl),
- X86_MATCH_INTEL_FAM6_MODEL(CANNONLAKE_L, &model_skl),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT, &model_hsw),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_D, &model_hsw),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_PLUS, &model_hsw),
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, &model_skl),
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE, &model_skl),
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, &model_hsx),
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &model_hsx),
- X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE_L, &model_skl),
- X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE, &model_skl),
- X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, &model_skl),
- X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, &model_skl),
- X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, &model_skl),
- X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, &model_skl),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_GRACEMONT, &model_skl),
- X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &model_spr),
- X86_MATCH_INTEL_FAM6_MODEL(EMERALDRAPIDS_X, &model_spr),
- X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, &model_skl),
- X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, &model_skl),
- X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_S, &model_skl),
- X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE, &model_skl),
- X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE_L, &model_skl),
+ X86_MATCH_FEATURE(X86_FEATURE_RAPL, &model_amd_hygon),
+ X86_MATCH_VFM(INTEL_SANDYBRIDGE, &model_snb),
+ X86_MATCH_VFM(INTEL_SANDYBRIDGE_X, &model_snbep),
+ X86_MATCH_VFM(INTEL_IVYBRIDGE, &model_snb),
+ X86_MATCH_VFM(INTEL_IVYBRIDGE_X, &model_snbep),
+ X86_MATCH_VFM(INTEL_HASWELL, &model_hsw),
+ X86_MATCH_VFM(INTEL_HASWELL_X, &model_hsx),
+ X86_MATCH_VFM(INTEL_HASWELL_L, &model_hsw),
+ X86_MATCH_VFM(INTEL_HASWELL_G, &model_hsw),
+ X86_MATCH_VFM(INTEL_BROADWELL, &model_hsw),
+ X86_MATCH_VFM(INTEL_BROADWELL_G, &model_hsw),
+ X86_MATCH_VFM(INTEL_BROADWELL_X, &model_hsx),
+ X86_MATCH_VFM(INTEL_BROADWELL_D, &model_hsx),
+ X86_MATCH_VFM(INTEL_XEON_PHI_KNL, &model_knl),
+ X86_MATCH_VFM(INTEL_XEON_PHI_KNM, &model_knl),
+ X86_MATCH_VFM(INTEL_SKYLAKE_L, &model_skl),
+ X86_MATCH_VFM(INTEL_SKYLAKE, &model_skl),
+ X86_MATCH_VFM(INTEL_SKYLAKE_X, &model_hsx),
+ X86_MATCH_VFM(INTEL_KABYLAKE_L, &model_skl),
+ X86_MATCH_VFM(INTEL_KABYLAKE, &model_skl),
+ X86_MATCH_VFM(INTEL_CANNONLAKE_L, &model_skl),
+ X86_MATCH_VFM(INTEL_ATOM_GOLDMONT, &model_hsw),
+ X86_MATCH_VFM(INTEL_ATOM_GOLDMONT_D, &model_hsw),
+ X86_MATCH_VFM(INTEL_ATOM_GOLDMONT_PLUS, &model_hsw),
+ X86_MATCH_VFM(INTEL_ICELAKE_L, &model_skl),
+ X86_MATCH_VFM(INTEL_ICELAKE, &model_skl),
+ X86_MATCH_VFM(INTEL_ICELAKE_D, &model_hsx),
+ X86_MATCH_VFM(INTEL_ICELAKE_X, &model_hsx),
+ X86_MATCH_VFM(INTEL_COMETLAKE_L, &model_skl),
+ X86_MATCH_VFM(INTEL_COMETLAKE, &model_skl),
+ X86_MATCH_VFM(INTEL_TIGERLAKE_L, &model_skl),
+ X86_MATCH_VFM(INTEL_TIGERLAKE, &model_skl),
+ X86_MATCH_VFM(INTEL_ALDERLAKE, &model_skl),
+ X86_MATCH_VFM(INTEL_ALDERLAKE_L, &model_skl),
+ X86_MATCH_VFM(INTEL_ATOM_GRACEMONT, &model_skl),
+ X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, &model_spr),
+ X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, &model_spr),
+ X86_MATCH_VFM(INTEL_RAPTORLAKE, &model_skl),
+ X86_MATCH_VFM(INTEL_RAPTORLAKE_P, &model_skl),
+ X86_MATCH_VFM(INTEL_RAPTORLAKE_S, &model_skl),
+ X86_MATCH_VFM(INTEL_METEORLAKE, &model_skl),
+ X86_MATCH_VFM(INTEL_METEORLAKE_L, &model_skl),
+ X86_MATCH_VFM(INTEL_ARROWLAKE_H, &model_skl),
+ X86_MATCH_VFM(INTEL_ARROWLAKE, &model_skl),
+ X86_MATCH_VFM(INTEL_LUNARLAKE_M, &model_skl),
{},
};
MODULE_DEVICE_TABLE(x86cpu, rapl_model_match);
diff --git a/arch/x86/events/zhaoxin/core.c b/arch/x86/events/zhaoxin/core.c
index 3e9acdaeed1e..2fd9b0cf9a5e 100644
--- a/arch/x86/events/zhaoxin/core.c
+++ b/arch/x86/events/zhaoxin/core.c
@@ -530,13 +530,13 @@ __init int zhaoxin_pmu_init(void)
pr_info("Version check pass!\n");
x86_pmu.version = version;
- x86_pmu.num_counters = eax.split.num_counters;
+ x86_pmu.cntr_mask64 = GENMASK_ULL(eax.split.num_counters - 1, 0);
x86_pmu.cntval_bits = eax.split.bit_width;
x86_pmu.cntval_mask = (1ULL << eax.split.bit_width) - 1;
x86_pmu.events_maskl = ebx.full;
x86_pmu.events_mask_len = eax.split.mask_length;
- x86_pmu.num_counters_fixed = edx.split.num_counters_fixed;
+ x86_pmu.fixed_cntr_mask64 = GENMASK_ULL(edx.split.num_counters_fixed - 1, 0);
x86_add_quirk(zhaoxin_arch_events_quirk);
switch (boot_cpu_data.x86) {
@@ -604,13 +604,13 @@ __init int zhaoxin_pmu_init(void)
return -ENODEV;
}
- x86_pmu.intel_ctrl = (1 << (x86_pmu.num_counters)) - 1;
- x86_pmu.intel_ctrl |= ((1LL << x86_pmu.num_counters_fixed)-1) << INTEL_PMC_IDX_FIXED;
+ x86_pmu.intel_ctrl = x86_pmu.cntr_mask64;
+ x86_pmu.intel_ctrl |= x86_pmu.fixed_cntr_mask64 << INTEL_PMC_IDX_FIXED;
if (x86_pmu.event_constraints) {
for_each_event_constraint(c, x86_pmu.event_constraints) {
- c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
- c->weight += x86_pmu.num_counters;
+ c->idxmsk64 |= x86_pmu.cntr_mask64;
+ c->weight += x86_pmu_num_counters(NULL);
}
}
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index 17a71e92a343..95eada2994e1 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -35,7 +35,6 @@
#include <clocksource/hyperv_timer.h>
#include <linux/highmem.h>
-int hyperv_init_cpuhp;
u64 hv_current_partition_id = ~0ull;
EXPORT_SYMBOL_GPL(hv_current_partition_id);
@@ -607,8 +606,6 @@ skip_hypercall_pg_init:
register_syscore_ops(&hv_syscore_ops);
- hyperv_init_cpuhp = cpuhp;
-
if (cpuid_ebx(HYPERV_CPUID_FEATURES) & HV_ACCESS_PARTITION_ID)
hv_get_partition_id();
@@ -637,7 +634,7 @@ skip_hypercall_pg_init:
clean_guest_os_id:
wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0);
hv_ivm_msr_write(HV_X64_MSR_GUEST_OS_ID, 0);
- cpuhp_remove_state(cpuhp);
+ cpuhp_remove_state(CPUHP_AP_HYPERV_ONLINE);
free_ghcb_page:
free_percpu(hv_ghcb_pg);
free_vp_assist_page:
diff --git a/arch/x86/hyperv/hv_vtl.c b/arch/x86/hyperv/hv_vtl.c
index 5c7de79423b8..04775346369c 100644
--- a/arch/x86/hyperv/hv_vtl.c
+++ b/arch/x86/hyperv/hv_vtl.c
@@ -34,7 +34,6 @@ void __init hv_vtl_init_platform(void)
/* Avoid searching for BIOS MP tables */
x86_init.mpparse.find_mptable = x86_init_noop;
x86_init.mpparse.early_parse_smp_cfg = x86_init_noop;
- x86_init.mpparse.parse_smp_cfg = x86_init_noop;
x86_platform.get_wallclock = get_rtc_noop;
x86_platform.set_wallclock = set_rtc_noop;
diff --git a/arch/x86/hyperv/ivm.c b/arch/x86/hyperv/ivm.c
index 768d73de0d09..60fc3ed72830 100644
--- a/arch/x86/hyperv/ivm.c
+++ b/arch/x86/hyperv/ivm.c
@@ -321,9 +321,9 @@ int hv_snp_boot_ap(u32 cpu, unsigned long start_ip)
vmsa->efer = native_read_msr(MSR_EFER);
- asm volatile("movq %%cr4, %%rax;" : "=a" (vmsa->cr4));
- asm volatile("movq %%cr3, %%rax;" : "=a" (vmsa->cr3));
- asm volatile("movq %%cr0, %%rax;" : "=a" (vmsa->cr0));
+ vmsa->cr4 = native_read_cr4();
+ vmsa->cr3 = __native_read_cr3();
+ vmsa->cr0 = native_read_cr0();
vmsa->xcr0 = 1;
vmsa->g_pat = HV_AP_INIT_GPAT_DEFAULT;
@@ -523,9 +523,9 @@ static int hv_mark_gpa_visibility(u16 count, const u64 pfn[],
* transition is complete, hv_vtom_set_host_visibility() marks the pages
* as "present" again.
*/
-static bool hv_vtom_clear_present(unsigned long kbuffer, int pagecount, bool enc)
+static int hv_vtom_clear_present(unsigned long kbuffer, int pagecount, bool enc)
{
- return !set_memory_np(kbuffer, pagecount);
+ return set_memory_np(kbuffer, pagecount);
}
/*
@@ -536,20 +536,19 @@ static bool hv_vtom_clear_present(unsigned long kbuffer, int pagecount, bool enc
* with host. This function works as wrap of hv_mark_gpa_visibility()
* with memory base and size.
*/
-static bool hv_vtom_set_host_visibility(unsigned long kbuffer, int pagecount, bool enc)
+static int hv_vtom_set_host_visibility(unsigned long kbuffer, int pagecount, bool enc)
{
enum hv_mem_host_visibility visibility = enc ?
VMBUS_PAGE_NOT_VISIBLE : VMBUS_PAGE_VISIBLE_READ_WRITE;
u64 *pfn_array;
phys_addr_t paddr;
+ int i, pfn, err;
void *vaddr;
int ret = 0;
- bool result = true;
- int i, pfn;
pfn_array = kmalloc(HV_HYP_PAGE_SIZE, GFP_KERNEL);
if (!pfn_array) {
- result = false;
+ ret = -ENOMEM;
goto err_set_memory_p;
}
@@ -568,10 +567,8 @@ static bool hv_vtom_set_host_visibility(unsigned long kbuffer, int pagecount, bo
if (pfn == HV_MAX_MODIFY_GPA_REP_COUNT || i == pagecount - 1) {
ret = hv_mark_gpa_visibility(pfn, pfn_array,
visibility);
- if (ret) {
- result = false;
+ if (ret)
goto err_free_pfn_array;
- }
pfn = 0;
}
}
@@ -586,10 +583,11 @@ err_set_memory_p:
* order to avoid leaving the memory range in a "broken" state. Setting
* the PRESENT bits shouldn't fail, but return an error if it does.
*/
- if (set_memory_p(kbuffer, pagecount))
- result = false;
+ err = set_memory_p(kbuffer, pagecount);
+ if (err && !ret)
+ ret = err;
- return result;
+ return ret;
}
static bool hv_vtom_tlb_flush_required(bool private)
diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild
index a192bdea69e2..6c23d1661b17 100644
--- a/arch/x86/include/asm/Kbuild
+++ b/arch/x86/include/asm/Kbuild
@@ -11,3 +11,4 @@ generated-y += xen-hypercalls.h
generic-y += early_ioremap.h
generic-y += mcs_spinlock.h
+generic-y += mmzone.h
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index f896eed4516c..5ab1a4598d00 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -56,6 +56,8 @@ static inline void disable_acpi(void)
extern int acpi_gsi_to_irq(u32 gsi, unsigned int *irq);
+extern int acpi_blacklisted(void);
+
static inline void acpi_noirq_set(void) { acpi_noirq = 1; }
static inline void acpi_disable_pci(void)
{
@@ -76,6 +78,13 @@ static inline bool acpi_skip_set_wakeup_address(void)
#define acpi_skip_set_wakeup_address acpi_skip_set_wakeup_address
+union acpi_subtable_headers;
+
+int __init acpi_parse_mp_wake(union acpi_subtable_headers *header,
+ const unsigned long end);
+
+void asm_acpi_mp_play_dead(u64 reset_vector, u64 pgd_pa);
+
/*
* Check if the CPU can handle C2 and deeper
*/
@@ -165,6 +174,14 @@ void acpi_generic_reduced_hw_init(void);
void x86_default_set_root_pointer(u64 addr);
u64 x86_default_get_root_pointer(void);
+#ifdef CONFIG_XEN_PV
+/* A Xen PV domain needs a special acpi_os_ioremap() handling. */
+extern void __iomem * (*acpi_os_ioremap)(acpi_physical_address phys,
+ acpi_size size);
+void __iomem *x86_acpi_os_ioremap(acpi_physical_address phys, acpi_size size);
+#define acpi_os_ioremap acpi_os_ioremap
+#endif
+
#else /* !CONFIG_ACPI */
#define acpi_lapic 0
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 67b68d0d17d1..ca9ae606aab9 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -156,102 +156,50 @@ static inline int alternatives_text_reserved(void *start, void *end)
#define ALT_CALL_INSTR "call BUG_func"
-#define b_replacement(num) "664"#num
-#define e_replacement(num) "665"#num
+#define alt_slen "772b-771b"
+#define alt_total_slen "773b-771b"
+#define alt_rlen "775f-774f"
-#define alt_end_marker "663"
-#define alt_slen "662b-661b"
-#define alt_total_slen alt_end_marker"b-661b"
-#define alt_rlen(num) e_replacement(num)"f-"b_replacement(num)"f"
-
-#define OLDINSTR(oldinstr, num) \
- "# ALT: oldnstr\n" \
- "661:\n\t" oldinstr "\n662:\n" \
+#define OLDINSTR(oldinstr) \
+ "# ALT: oldinstr\n" \
+ "771:\n\t" oldinstr "\n772:\n" \
"# ALT: padding\n" \
- ".skip -(((" alt_rlen(num) ")-(" alt_slen ")) > 0) * " \
- "((" alt_rlen(num) ")-(" alt_slen ")),0x90\n" \
- alt_end_marker ":\n"
-
-/*
- * gas compatible max based on the idea from:
- * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax
- *
- * The additional "-" is needed because gas uses a "true" value of -1.
- */
-#define alt_max_short(a, b) "((" a ") ^ (((" a ") ^ (" b ")) & -(-((" a ") < (" b ")))))"
-
-/*
- * Pad the second replacement alternative with additional NOPs if it is
- * additionally longer than the first replacement alternative.
- */
-#define OLDINSTR_2(oldinstr, num1, num2) \
- "# ALT: oldinstr2\n" \
- "661:\n\t" oldinstr "\n662:\n" \
- "# ALT: padding2\n" \
- ".skip -((" alt_max_short(alt_rlen(num1), alt_rlen(num2)) " - (" alt_slen ")) > 0) * " \
- "(" alt_max_short(alt_rlen(num1), alt_rlen(num2)) " - (" alt_slen ")), 0x90\n" \
- alt_end_marker ":\n"
-
-#define OLDINSTR_3(oldinsn, n1, n2, n3) \
- "# ALT: oldinstr3\n" \
- "661:\n\t" oldinsn "\n662:\n" \
- "# ALT: padding3\n" \
- ".skip -((" alt_max_short(alt_max_short(alt_rlen(n1), alt_rlen(n2)), alt_rlen(n3)) \
- " - (" alt_slen ")) > 0) * " \
- "(" alt_max_short(alt_max_short(alt_rlen(n1), alt_rlen(n2)), alt_rlen(n3)) \
- " - (" alt_slen ")), 0x90\n" \
- alt_end_marker ":\n"
-
-#define ALTINSTR_ENTRY(ft_flags, num) \
- " .long 661b - .\n" /* label */ \
- " .long " b_replacement(num)"f - .\n" /* new instruction */ \
+ ".skip -(((" alt_rlen ")-(" alt_slen ")) > 0) * " \
+ "((" alt_rlen ")-(" alt_slen ")),0x90\n" \
+ "773:\n"
+
+#define ALTINSTR_ENTRY(ft_flags) \
+ ".pushsection .altinstructions,\"a\"\n" \
+ " .long 771b - .\n" /* label */ \
+ " .long 774f - .\n" /* new instruction */ \
" .4byte " __stringify(ft_flags) "\n" /* feature + flags */ \
" .byte " alt_total_slen "\n" /* source len */ \
- " .byte " alt_rlen(num) "\n" /* replacement len */
+ " .byte " alt_rlen "\n" /* replacement len */ \
+ ".popsection\n"
-#define ALTINSTR_REPLACEMENT(newinstr, num) /* replacement */ \
- "# ALT: replacement " #num "\n" \
- b_replacement(num)":\n\t" newinstr "\n" e_replacement(num) ":\n"
+#define ALTINSTR_REPLACEMENT(newinstr) /* replacement */ \
+ ".pushsection .altinstr_replacement, \"ax\"\n" \
+ "# ALT: replacement\n" \
+ "774:\n\t" newinstr "\n775:\n" \
+ ".popsection\n"
/* alternative assembly primitive: */
#define ALTERNATIVE(oldinstr, newinstr, ft_flags) \
- OLDINSTR(oldinstr, 1) \
- ".pushsection .altinstructions,\"a\"\n" \
- ALTINSTR_ENTRY(ft_flags, 1) \
- ".popsection\n" \
- ".pushsection .altinstr_replacement, \"ax\"\n" \
- ALTINSTR_REPLACEMENT(newinstr, 1) \
- ".popsection\n"
+ OLDINSTR(oldinstr) \
+ ALTINSTR_ENTRY(ft_flags) \
+ ALTINSTR_REPLACEMENT(newinstr)
#define ALTERNATIVE_2(oldinstr, newinstr1, ft_flags1, newinstr2, ft_flags2) \
- OLDINSTR_2(oldinstr, 1, 2) \
- ".pushsection .altinstructions,\"a\"\n" \
- ALTINSTR_ENTRY(ft_flags1, 1) \
- ALTINSTR_ENTRY(ft_flags2, 2) \
- ".popsection\n" \
- ".pushsection .altinstr_replacement, \"ax\"\n" \
- ALTINSTR_REPLACEMENT(newinstr1, 1) \
- ALTINSTR_REPLACEMENT(newinstr2, 2) \
- ".popsection\n"
+ ALTERNATIVE(ALTERNATIVE(oldinstr, newinstr1, ft_flags1), newinstr2, ft_flags2)
/* If @feature is set, patch in @newinstr_yes, otherwise @newinstr_no. */
#define ALTERNATIVE_TERNARY(oldinstr, ft_flags, newinstr_yes, newinstr_no) \
- ALTERNATIVE_2(oldinstr, newinstr_no, X86_FEATURE_ALWAYS, \
- newinstr_yes, ft_flags)
-
-#define ALTERNATIVE_3(oldinsn, newinsn1, ft_flags1, newinsn2, ft_flags2, \
- newinsn3, ft_flags3) \
- OLDINSTR_3(oldinsn, 1, 2, 3) \
- ".pushsection .altinstructions,\"a\"\n" \
- ALTINSTR_ENTRY(ft_flags1, 1) \
- ALTINSTR_ENTRY(ft_flags2, 2) \
- ALTINSTR_ENTRY(ft_flags3, 3) \
- ".popsection\n" \
- ".pushsection .altinstr_replacement, \"ax\"\n" \
- ALTINSTR_REPLACEMENT(newinsn1, 1) \
- ALTINSTR_REPLACEMENT(newinsn2, 2) \
- ALTINSTR_REPLACEMENT(newinsn3, 3) \
- ".popsection\n"
+ ALTERNATIVE_2(oldinstr, newinstr_no, X86_FEATURE_ALWAYS, newinstr_yes, ft_flags)
+
+#define ALTERNATIVE_3(oldinstr, newinstr1, ft_flags1, newinstr2, ft_flags2, \
+ newinstr3, ft_flags3) \
+ ALTERNATIVE(ALTERNATIVE_2(oldinstr, newinstr1, ft_flags1, newinstr2, ft_flags2), \
+ newinstr3, ft_flags3)
/*
* Alternative instructions for different CPU types or capabilities.
@@ -266,14 +214,11 @@ static inline int alternatives_text_reserved(void *start, void *end)
* without volatile and memory clobber.
*/
#define alternative(oldinstr, newinstr, ft_flags) \
- asm_inline volatile (ALTERNATIVE(oldinstr, newinstr, ft_flags) : : : "memory")
+ asm_inline volatile(ALTERNATIVE(oldinstr, newinstr, ft_flags) : : : "memory")
#define alternative_2(oldinstr, newinstr1, ft_flags1, newinstr2, ft_flags2) \
asm_inline volatile(ALTERNATIVE_2(oldinstr, newinstr1, ft_flags1, newinstr2, ft_flags2) ::: "memory")
-#define alternative_ternary(oldinstr, ft_flags, newinstr_yes, newinstr_no) \
- asm_inline volatile(ALTERNATIVE_TERNARY(oldinstr, ft_flags, newinstr_yes, newinstr_no) ::: "memory")
-
/*
* Alternative inline assembly with input.
*
@@ -283,32 +228,28 @@ static inline int alternatives_text_reserved(void *start, void *end)
* Leaving an unused argument 0 to keep API compatibility.
*/
#define alternative_input(oldinstr, newinstr, ft_flags, input...) \
- asm_inline volatile (ALTERNATIVE(oldinstr, newinstr, ft_flags) \
- : : "i" (0), ## input)
-
-/*
- * This is similar to alternative_input. But it has two features and
- * respective instructions.
- *
- * If CPU has feature2, newinstr2 is used.
- * Otherwise, if CPU has feature1, newinstr1 is used.
- * Otherwise, oldinstr is used.
- */
-#define alternative_input_2(oldinstr, newinstr1, ft_flags1, newinstr2, \
- ft_flags2, input...) \
- asm_inline volatile(ALTERNATIVE_2(oldinstr, newinstr1, ft_flags1, \
- newinstr2, ft_flags2) \
+ asm_inline volatile(ALTERNATIVE(oldinstr, newinstr, ft_flags) \
: : "i" (0), ## input)
/* Like alternative_input, but with a single output argument */
#define alternative_io(oldinstr, newinstr, ft_flags, output, input...) \
- asm_inline volatile (ALTERNATIVE(oldinstr, newinstr, ft_flags) \
+ asm_inline volatile(ALTERNATIVE(oldinstr, newinstr, ft_flags) \
: output : "i" (0), ## input)
-/* Like alternative_io, but for replacing a direct call with another one. */
-#define alternative_call(oldfunc, newfunc, ft_flags, output, input...) \
- asm_inline volatile (ALTERNATIVE("call %P[old]", "call %P[new]", ft_flags) \
- : output : [old] "i" (oldfunc), [new] "i" (newfunc), ## input)
+/*
+ * Like alternative_io, but for replacing a direct call with another one.
+ *
+ * Use the %c operand modifier which is the generic way to print a bare
+ * constant expression with all syntax-specific punctuation omitted. %P
+ * is the x86-specific variant which can handle constants too, for
+ * historical reasons, but it should be used primarily for PIC
+ * references: i.e., if used for a function, it would add the PLT
+ * suffix.
+ */
+#define alternative_call(oldfunc, newfunc, ft_flags, output, input...) \
+ asm_inline volatile(ALTERNATIVE("call %c[old]", "call %c[new]", ft_flags) \
+ : ALT_OUTPUT_SP(output) \
+ : [old] "i" (oldfunc), [new] "i" (newfunc), ## input)
/*
* Like alternative_call, but there are two features and respective functions.
@@ -316,12 +257,12 @@ static inline int alternatives_text_reserved(void *start, void *end)
* Otherwise, if CPU has feature1, function1 is used.
* Otherwise, old function is used.
*/
-#define alternative_call_2(oldfunc, newfunc1, ft_flags1, newfunc2, ft_flags2, \
- output, input...) \
- asm_inline volatile (ALTERNATIVE_2("call %P[old]", "call %P[new1]", ft_flags1,\
- "call %P[new2]", ft_flags2) \
- : output, ASM_CALL_CONSTRAINT \
- : [old] "i" (oldfunc), [new1] "i" (newfunc1), \
+#define alternative_call_2(oldfunc, newfunc1, ft_flags1, newfunc2, ft_flags2, \
+ output, input...) \
+ asm_inline volatile(ALTERNATIVE_2("call %c[old]", "call %c[new1]", ft_flags1, \
+ "call %c[new2]", ft_flags2) \
+ : ALT_OUTPUT_SP(output) \
+ : [old] "i" (oldfunc), [new1] "i" (newfunc1), \
[new2] "i" (newfunc2), ## input)
/*
@@ -336,6 +277,8 @@ static inline int alternatives_text_reserved(void *start, void *end)
*/
#define ASM_NO_INPUT_CLOBBER(clbr...) "i" (0) : clbr
+#define ALT_OUTPUT_SP(...) ASM_CALL_CONSTRAINT, ## __VA_ARGS__
+
/* Macro for creating assembler functions avoiding any C magic. */
#define DEFINE_ASM_FUNC(func, instr, sec) \
asm (".pushsection " #sec ", \"ax\"\n" \
@@ -402,22 +345,23 @@ void nop_func(void);
* @newinstr. ".skip" directive takes care of proper instruction padding
* in case @newinstr is longer than @oldinstr.
*/
-.macro ALTERNATIVE oldinstr, newinstr, ft_flags
-140:
- \oldinstr
-141:
- .skip -(((144f-143f)-(141b-140b)) > 0) * ((144f-143f)-(141b-140b)),0x90
-142:
-
- .pushsection .altinstructions,"a"
- altinstr_entry 140b,143f,\ft_flags,142b-140b,144f-143f
- .popsection
+#define __ALTERNATIVE(oldinst, newinst, flag) \
+740: \
+ oldinst ; \
+741: \
+ .skip -(((744f-743f)-(741b-740b)) > 0) * ((744f-743f)-(741b-740b)),0x90 ;\
+742: \
+ .pushsection .altinstructions,"a" ; \
+ altinstr_entry 740b,743f,flag,742b-740b,744f-743f ; \
+ .popsection ; \
+ .pushsection .altinstr_replacement,"ax" ; \
+743: \
+ newinst ; \
+744: \
+ .popsection ;
- .pushsection .altinstr_replacement,"ax"
-143:
- \newinstr
-144:
- .popsection
+.macro ALTERNATIVE oldinstr, newinstr, ft_flags
+ __ALTERNATIVE(\oldinstr, \newinstr, \ft_flags)
.endm
#define old_len 141b-140b
@@ -426,65 +370,18 @@ void nop_func(void);
#define new_len3 146f-145f
/*
- * gas compatible max based on the idea from:
- * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax
- *
- * The additional "-" is needed because gas uses a "true" value of -1.
- */
-#define alt_max_2(a, b) ((a) ^ (((a) ^ (b)) & -(-((a) < (b)))))
-#define alt_max_3(a, b, c) (alt_max_2(alt_max_2(a, b), c))
-
-
-/*
* Same as ALTERNATIVE macro above but for two alternatives. If CPU
* has @feature1, it replaces @oldinstr with @newinstr1. If CPU has
* @feature2, it replaces @oldinstr with @feature2.
*/
.macro ALTERNATIVE_2 oldinstr, newinstr1, ft_flags1, newinstr2, ft_flags2
-140:
- \oldinstr
-141:
- .skip -((alt_max_2(new_len1, new_len2) - (old_len)) > 0) * \
- (alt_max_2(new_len1, new_len2) - (old_len)),0x90
-142:
-
- .pushsection .altinstructions,"a"
- altinstr_entry 140b,143f,\ft_flags1,142b-140b,144f-143f
- altinstr_entry 140b,144f,\ft_flags2,142b-140b,145f-144f
- .popsection
-
- .pushsection .altinstr_replacement,"ax"
-143:
- \newinstr1
-144:
- \newinstr2
-145:
- .popsection
+ __ALTERNATIVE(__ALTERNATIVE(\oldinstr, \newinstr1, \ft_flags1),
+ \newinstr2, \ft_flags2)
.endm
.macro ALTERNATIVE_3 oldinstr, newinstr1, ft_flags1, newinstr2, ft_flags2, newinstr3, ft_flags3
-140:
- \oldinstr
-141:
- .skip -((alt_max_3(new_len1, new_len2, new_len3) - (old_len)) > 0) * \
- (alt_max_3(new_len1, new_len2, new_len3) - (old_len)),0x90
-142:
-
- .pushsection .altinstructions,"a"
- altinstr_entry 140b,143f,\ft_flags1,142b-140b,144f-143f
- altinstr_entry 140b,144f,\ft_flags2,142b-140b,145f-144f
- altinstr_entry 140b,145f,\ft_flags3,142b-140b,146f-145f
- .popsection
-
- .pushsection .altinstr_replacement,"ax"
-143:
- \newinstr1
-144:
- \newinstr2
-145:
- \newinstr3
-146:
- .popsection
+ __ALTERNATIVE(ALTERNATIVE_2(\oldinstr, \newinstr1, \ft_flags1, \newinstr2, \ft_flags2),
+ \newinstr3, \ft_flags3)
.endm
/* If @feature is set, patch in @newinstr_yes, otherwise @newinstr_no. */
diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h
index 5c37944c8a5e..6f3b6aef47ba 100644
--- a/arch/x86/include/asm/amd_nb.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -21,8 +21,8 @@ extern int amd_numa_init(void);
extern int amd_get_subcaches(int);
extern int amd_set_subcaches(int, unsigned long);
-extern int amd_smn_read(u16 node, u32 address, u32 *value);
-extern int amd_smn_write(u16 node, u32 address, u32 value);
+int __must_check amd_smn_read(u16 node, u32 address, u32 *value);
+int __must_check amd_smn_write(u16 node, u32 address, u32 value);
struct amd_l3_cache {
unsigned indices;
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index e6ab0cf15ed5..f21ff1932699 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -14,9 +14,15 @@
#include <asm/msr.h>
#include <asm/hardirq.h>
#include <asm/io.h>
+#include <asm/posted_intr.h>
#define ARCH_APICTIMER_STOPS_ON_C3 1
+/* Macros for apic_extnmi which controls external NMI masking */
+#define APIC_EXTNMI_BSP 0 /* Default */
+#define APIC_EXTNMI_ALL 1
+#define APIC_EXTNMI_NONE 2
+
/*
* Debugging macros
*/
@@ -24,22 +30,22 @@
#define APIC_VERBOSE 1
#define APIC_DEBUG 2
-/* Macros for apic_extnmi which controls external NMI masking */
-#define APIC_EXTNMI_BSP 0 /* Default */
-#define APIC_EXTNMI_ALL 1
-#define APIC_EXTNMI_NONE 2
-
/*
- * Define the default level of output to be very little
- * This can be turned up by using apic=verbose for more
- * information and apic=debug for _lots_ of information.
- * apic_verbosity is defined in apic.c
+ * Define the default level of output to be very little This can be turned
+ * up by using apic=verbose for more information and apic=debug for _lots_
+ * of information. apic_verbosity is defined in apic.c
*/
-#define apic_printk(v, s, a...) do { \
- if ((v) <= apic_verbosity) \
- printk(s, ##a); \
- } while (0)
-
+#define apic_printk(v, s, a...) \
+do { \
+ if ((v) <= apic_verbosity) \
+ printk(s, ##a); \
+} while (0)
+
+#define apic_pr_verbose(s, a...) apic_printk(APIC_VERBOSE, KERN_INFO s, ##a)
+#define apic_pr_debug(s, a...) apic_printk(APIC_DEBUG, KERN_DEBUG s, ##a)
+#define apic_pr_debug_cont(s, a...) apic_printk(APIC_DEBUG, KERN_CONT s, ##a)
+/* Unconditional debug prints for code which is guarded by apic_verbosity already */
+#define apic_dbg(s, a...) printk(KERN_DEBUG s, ##a)
#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32)
extern void x86_32_probe_apic(void);
@@ -92,7 +98,7 @@ static inline void native_apic_mem_write(u32 reg, u32 v)
{
volatile u32 *addr = (volatile u32 *)(APIC_BASE + reg);
- alternative_io("movl %0, %P1", "xchgl %0, %P1", X86_BUG_11AP,
+ alternative_io("movl %0, %1", "xchgl %0, %1", X86_BUG_11AP,
ASM_OUTPUT2("=r" (v), "=m" (*addr)),
ASM_OUTPUT2("0" (v), "m" (*addr)));
}
@@ -121,8 +127,6 @@ static inline bool apic_is_x2apic_enabled(void)
extern void enable_IR_x2apic(void);
-extern int get_physical_broadcast(void);
-
extern int lapic_get_maxlvt(void);
extern void clear_local_APIC(void);
extern void disconnect_bsp_APIC(int virt_wire_setup);
@@ -344,20 +348,12 @@ extern struct apic *apic;
* APIC drivers are probed based on how they are listed in the .apicdrivers
* section. So the order is important and enforced by the ordering
* of different apic driver files in the Makefile.
- *
- * For the files having two apic drivers, we use apic_drivers()
- * to enforce the order with in them.
*/
#define apic_driver(sym) \
static const struct apic *__apicdrivers_##sym __used \
__aligned(sizeof(struct apic *)) \
__section(".apicdrivers") = { &sym }
-#define apic_drivers(sym1, sym2) \
- static struct apic *__apicdrivers_##sym1##sym2[2] __used \
- __aligned(sizeof(struct apic *)) \
- __section(".apicdrivers") = { &sym1, &sym2 }
-
extern struct apic *__apicdrivers[], *__apicdrivers_end[];
/*
@@ -483,7 +479,6 @@ static inline u64 apic_icr_read(void) { return 0; }
static inline void apic_icr_write(u32 low, u32 high) { }
static inline void apic_wait_icr_idle(void) { }
static inline u32 safe_apic_wait_icr_idle(void) { return 0; }
-static inline void apic_set_eoi_cb(void (*eoi)(void)) {}
static inline void apic_native_eoi(void) { WARN_ON_ONCE(1); }
static inline void apic_setup_apic_calls(void) { }
@@ -500,14 +495,17 @@ static inline bool lapic_vector_set_in_irr(unsigned int vector)
return !!(irr & (1U << (vector % 32)));
}
+static inline bool is_vector_pending(unsigned int vector)
+{
+ return lapic_vector_set_in_irr(vector) || pi_pending_this_cpu(vector);
+}
+
/*
* Warm reset vector position:
*/
#define TRAMPOLINE_PHYS_LOW 0x467
#define TRAMPOLINE_PHYS_HIGH 0x469
-extern void generic_bigsmp_probe(void);
-
#ifdef CONFIG_X86_LOCAL_APIC
#include <asm/smp.h>
@@ -530,8 +528,6 @@ static inline int default_acpi_madt_oem_check(char *a, char *b) { return 0; }
static inline void x86_64_probe_apic(void) { }
#endif
-extern int default_apic_id_valid(u32 apicid);
-
extern u32 apic_default_calc_apicid(unsigned int cpu);
extern u32 apic_flat_calc_apicid(unsigned int cpu);
diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index ca8eed1d496a..2bec0c89a95c 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -229,9 +229,6 @@ register unsigned long current_stack_pointer asm(_ASM_SP);
#define _ASM_EXTABLE_UA(from, to) \
_ASM_EXTABLE_TYPE(from, to, EX_TYPE_UACCESS)
-#define _ASM_EXTABLE_CPY(from, to) \
- _ASM_EXTABLE_TYPE(from, to, EX_TYPE_COPY)
-
#define _ASM_EXTABLE_FAULT(from, to) \
_ASM_EXTABLE_TYPE(from, to, EX_TYPE_FAULT)
diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index 55a55ec04350..55b4d24356ea 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -86,11 +86,7 @@ static __always_inline int arch_atomic_add_return(int i, atomic_t *v)
}
#define arch_atomic_add_return arch_atomic_add_return
-static __always_inline int arch_atomic_sub_return(int i, atomic_t *v)
-{
- return arch_atomic_add_return(-i, v);
-}
-#define arch_atomic_sub_return arch_atomic_sub_return
+#define arch_atomic_sub_return(i, v) arch_atomic_add_return(-(i), v)
static __always_inline int arch_atomic_fetch_add(int i, atomic_t *v)
{
@@ -98,11 +94,7 @@ static __always_inline int arch_atomic_fetch_add(int i, atomic_t *v)
}
#define arch_atomic_fetch_add arch_atomic_fetch_add
-static __always_inline int arch_atomic_fetch_sub(int i, atomic_t *v)
-{
- return xadd(&v->counter, -i);
-}
-#define arch_atomic_fetch_sub arch_atomic_fetch_sub
+#define arch_atomic_fetch_sub(i, v) arch_atomic_fetch_add(-(i), v)
static __always_inline int arch_atomic_cmpxchg(atomic_t *v, int old, int new)
{
diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h
index 3486d91b8595..1f650b4dde50 100644
--- a/arch/x86/include/asm/atomic64_32.h
+++ b/arch/x86/include/asm/atomic64_32.h
@@ -14,6 +14,32 @@ typedef struct {
#define ATOMIC64_INIT(val) { (val) }
+/*
+ * Read an atomic64_t non-atomically.
+ *
+ * This is intended to be used in cases where a subsequent atomic operation
+ * will handle the torn value, and can be used to prime the first iteration
+ * of unconditional try_cmpxchg() loops, e.g.:
+ *
+ * s64 val = arch_atomic64_read_nonatomic(v);
+ * do { } while (!arch_atomic64_try_cmpxchg(v, &val, val OP i);
+ *
+ * This is NOT safe to use where the value is not always checked by a
+ * subsequent atomic operation, such as in conditional try_cmpxchg() loops
+ * that can break before the atomic operation, e.g.:
+ *
+ * s64 val = arch_atomic64_read_nonatomic(v);
+ * do {
+ * if (condition(val))
+ * break;
+ * } while (!arch_atomic64_try_cmpxchg(v, &val, val OP i);
+ */
+static __always_inline s64 arch_atomic64_read_nonatomic(const atomic64_t *v)
+{
+ /* See comment in arch_atomic_read(). */
+ return __READ_ONCE(v->counter);
+}
+
#define __ATOMIC64_DECL(sym) void atomic64_##sym(atomic64_t *, ...)
#ifndef ATOMIC64_EXPORT
#define ATOMIC64_DECL_ONE __ATOMIC64_DECL
@@ -24,7 +50,7 @@ typedef struct {
#ifdef CONFIG_X86_CMPXCHG64
#define __alternative_atomic64(f, g, out, in...) \
- asm volatile("call %P[func]" \
+ asm volatile("call %c[func]" \
: out : [func] "i" (atomic64_##g##_cx8), ## in)
#define ATOMIC64_DECL(sym) ATOMIC64_DECL_ONE(sym##_cx8)
@@ -61,12 +87,18 @@ ATOMIC64_DECL(add_unless);
#undef __ATOMIC64_DECL
#undef ATOMIC64_EXPORT
-static __always_inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 o, s64 n)
+static __always_inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new)
{
- return arch_cmpxchg64(&v->counter, o, n);
+ return arch_cmpxchg64(&v->counter, old, new);
}
#define arch_atomic64_cmpxchg arch_atomic64_cmpxchg
+static __always_inline bool arch_atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new)
+{
+ return arch_try_cmpxchg64(&v->counter, old, new);
+}
+#define arch_atomic64_try_cmpxchg arch_atomic64_try_cmpxchg
+
static __always_inline s64 arch_atomic64_xchg(atomic64_t *v, s64 n)
{
s64 o;
@@ -131,20 +163,18 @@ static __always_inline s64 arch_atomic64_dec_return(atomic64_t *v)
}
#define arch_atomic64_dec_return arch_atomic64_dec_return
-static __always_inline s64 arch_atomic64_add(s64 i, atomic64_t *v)
+static __always_inline void arch_atomic64_add(s64 i, atomic64_t *v)
{
__alternative_atomic64(add, add_return,
ASM_OUTPUT2("+A" (i), "+c" (v)),
ASM_NO_INPUT_CLOBBER("memory"));
- return i;
}
-static __always_inline s64 arch_atomic64_sub(s64 i, atomic64_t *v)
+static __always_inline void arch_atomic64_sub(s64 i, atomic64_t *v)
{
__alternative_atomic64(sub, sub_return,
ASM_OUTPUT2("+A" (i), "+c" (v)),
ASM_NO_INPUT_CLOBBER("memory"));
- return i;
}
static __always_inline void arch_atomic64_inc(atomic64_t *v)
@@ -195,69 +225,62 @@ static __always_inline s64 arch_atomic64_dec_if_positive(atomic64_t *v)
static __always_inline void arch_atomic64_and(s64 i, atomic64_t *v)
{
- s64 old, c = 0;
+ s64 val = arch_atomic64_read_nonatomic(v);
- while ((old = arch_atomic64_cmpxchg(v, c, c & i)) != c)
- c = old;
+ do { } while (!arch_atomic64_try_cmpxchg(v, &val, val & i));
}
static __always_inline s64 arch_atomic64_fetch_and(s64 i, atomic64_t *v)
{
- s64 old, c = 0;
+ s64 val = arch_atomic64_read_nonatomic(v);
- while ((old = arch_atomic64_cmpxchg(v, c, c & i)) != c)
- c = old;
+ do { } while (!arch_atomic64_try_cmpxchg(v, &val, val & i));
- return old;
+ return val;
}
#define arch_atomic64_fetch_and arch_atomic64_fetch_and
static __always_inline void arch_atomic64_or(s64 i, atomic64_t *v)
{
- s64 old, c = 0;
+ s64 val = arch_atomic64_read_nonatomic(v);
- while ((old = arch_atomic64_cmpxchg(v, c, c | i)) != c)
- c = old;
+ do { } while (!arch_atomic64_try_cmpxchg(v, &val, val | i));
}
static __always_inline s64 arch_atomic64_fetch_or(s64 i, atomic64_t *v)
{
- s64 old, c = 0;
+ s64 val = arch_atomic64_read_nonatomic(v);
- while ((old = arch_atomic64_cmpxchg(v, c, c | i)) != c)
- c = old;
+ do { } while (!arch_atomic64_try_cmpxchg(v, &val, val | i));
- return old;
+ return val;
}
#define arch_atomic64_fetch_or arch_atomic64_fetch_or
static __always_inline void arch_atomic64_xor(s64 i, atomic64_t *v)
{
- s64 old, c = 0;
+ s64 val = arch_atomic64_read_nonatomic(v);
- while ((old = arch_atomic64_cmpxchg(v, c, c ^ i)) != c)
- c = old;
+ do { } while (!arch_atomic64_try_cmpxchg(v, &val, val ^ i));
}
static __always_inline s64 arch_atomic64_fetch_xor(s64 i, atomic64_t *v)
{
- s64 old, c = 0;
+ s64 val = arch_atomic64_read_nonatomic(v);
- while ((old = arch_atomic64_cmpxchg(v, c, c ^ i)) != c)
- c = old;
+ do { } while (!arch_atomic64_try_cmpxchg(v, &val, val ^ i));
- return old;
+ return val;
}
#define arch_atomic64_fetch_xor arch_atomic64_fetch_xor
static __always_inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v)
{
- s64 old, c = 0;
+ s64 val = arch_atomic64_read_nonatomic(v);
- while ((old = arch_atomic64_cmpxchg(v, c, c + i)) != c)
- c = old;
+ do { } while (!arch_atomic64_try_cmpxchg(v, &val, val + i));
- return old;
+ return val;
}
#define arch_atomic64_fetch_add arch_atomic64_fetch_add
diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h
index 3165c0feedf7..ae12acae5b06 100644
--- a/arch/x86/include/asm/atomic64_64.h
+++ b/arch/x86/include/asm/atomic64_64.h
@@ -80,11 +80,7 @@ static __always_inline s64 arch_atomic64_add_return(s64 i, atomic64_t *v)
}
#define arch_atomic64_add_return arch_atomic64_add_return
-static __always_inline s64 arch_atomic64_sub_return(s64 i, atomic64_t *v)
-{
- return arch_atomic64_add_return(-i, v);
-}
-#define arch_atomic64_sub_return arch_atomic64_sub_return
+#define arch_atomic64_sub_return(i, v) arch_atomic64_add_return(-(i), v)
static __always_inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v)
{
@@ -92,11 +88,7 @@ static __always_inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v)
}
#define arch_atomic64_fetch_add arch_atomic64_fetch_add
-static __always_inline s64 arch_atomic64_fetch_sub(s64 i, atomic64_t *v)
-{
- return xadd(&v->counter, -i);
-}
-#define arch_atomic64_fetch_sub arch_atomic64_fetch_sub
+#define arch_atomic64_fetch_sub(i, v) arch_atomic64_fetch_add(-(i), v)
static __always_inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new)
{
diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h
index 63bdc6b85219..7b44b3c4cce1 100644
--- a/arch/x86/include/asm/barrier.h
+++ b/arch/x86/include/asm/barrier.h
@@ -33,20 +33,16 @@
* Returns:
* 0 - (index < size)
*/
-static __always_inline unsigned long array_index_mask_nospec(unsigned long index,
- unsigned long size)
-{
- unsigned long mask;
-
- asm volatile ("cmp %1,%2; sbb %0,%0;"
- :"=r" (mask)
- :"g"(size),"r" (index)
- :"cc");
- return mask;
-}
-
-/* Override the default implementation from linux/nospec.h. */
-#define array_index_mask_nospec array_index_mask_nospec
+#define array_index_mask_nospec(idx,sz) ({ \
+ typeof((idx)+(sz)) __idx = (idx); \
+ typeof(__idx) __sz = (sz); \
+ unsigned long __mask; \
+ asm volatile ("cmp %1,%2; sbb %0,%0" \
+ :"=r" (__mask) \
+ :ASM_INPUT_G (__sz), \
+ "r" (__idx) \
+ :"cc"); \
+ __mask; })
/* Prevent speculative execution past this barrier. */
#define barrier_nospec() alternative("", "lfence", X86_FEATURE_LFENCE_RDTSC)
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index 990eb686ca67..b96d45944c59 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -250,7 +250,7 @@ static __always_inline unsigned long variable__ffs(unsigned long word)
{
asm("rep; bsf %1,%0"
: "=r" (word)
- : "rm" (word));
+ : ASM_INPUT_RM (word));
return word;
}
@@ -297,7 +297,7 @@ static __always_inline unsigned long __fls(unsigned long word)
asm("bsr %1,%0"
: "=r" (word)
- : "rm" (word));
+ : ASM_INPUT_RM (word));
return word;
}
@@ -320,7 +320,7 @@ static __always_inline int variable_ffs(int x)
*/
asm("bsfl %1,%0"
: "=r" (r)
- : "rm" (x), "0" (-1));
+ : ASM_INPUT_RM (x), "0" (-1));
#elif defined(CONFIG_X86_CMOV)
asm("bsfl %1,%0\n\t"
"cmovzl %2,%0"
@@ -377,7 +377,7 @@ static __always_inline int fls(unsigned int x)
*/
asm("bsrl %1,%0"
: "=r" (r)
- : "rm" (x), "0" (-1));
+ : ASM_INPUT_RM (x), "0" (-1));
#elif defined(CONFIG_X86_CMOV)
asm("bsrl %1,%0\n\t"
"cmovzl %2,%0"
@@ -416,7 +416,7 @@ static __always_inline int fls64(__u64 x)
*/
asm("bsrq %1,%q0"
: "+r" (bitpos)
- : "rm" (x));
+ : ASM_INPUT_RM (x));
return bitpos + 1;
}
#else
diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h
index a3e0be0470a4..3e5b111e619d 100644
--- a/arch/x86/include/asm/boot.h
+++ b/arch/x86/include/asm/boot.h
@@ -6,11 +6,6 @@
#include <asm/pgtable_types.h>
#include <uapi/asm/boot.h>
-/* Physical address where kernel should be loaded. */
-#define LOAD_PHYSICAL_ADDR ((CONFIG_PHYSICAL_START \
- + (CONFIG_PHYSICAL_ALIGN - 1)) \
- & ~(CONFIG_PHYSICAL_ALIGN - 1))
-
/* Minimum kernel alignment, as a power of two */
#ifdef CONFIG_X86_64
# define MIN_KERNEL_ALIGN_LG2 PMD_SHIFT
diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h
index a3ec87d198ac..806649c7f23d 100644
--- a/arch/x86/include/asm/bug.h
+++ b/arch/x86/include/asm/bug.h
@@ -13,6 +13,18 @@
#define INSN_UD2 0x0b0f
#define LEN_UD2 2
+/*
+ * In clang we have UD1s reporting UBSAN failures on X86, 64 and 32bit.
+ */
+#define INSN_ASOP 0x67
+#define OPCODE_ESCAPE 0x0f
+#define SECOND_BYTE_OPCODE_UD1 0xb9
+#define SECOND_BYTE_OPCODE_UD2 0x0b
+
+#define BUG_NONE 0xffff
+#define BUG_UD1 0xfffe
+#define BUG_UD2 0xfffd
+
#ifdef CONFIG_GENERIC_BUG
#ifdef CONFIG_X86_32
diff --git a/arch/x86/include/asm/cfi.h b/arch/x86/include/asm/cfi.h
index 7cd752557905..31d19c815f99 100644
--- a/arch/x86/include/asm/cfi.h
+++ b/arch/x86/include/asm/cfi.h
@@ -93,7 +93,7 @@
*
*/
enum cfi_mode {
- CFI_DEFAULT, /* FineIBT if hardware has IBT, otherwise kCFI */
+ CFI_AUTO, /* FineIBT if hardware has IBT, otherwise kCFI */
CFI_OFF, /* Taditional / IBT depending on .config */
CFI_KCFI, /* Optionally CALL_PADDING, IBT, RETPOLINE */
CFI_FINEIBT, /* see arch/x86/kernel/alternative.c */
diff --git a/arch/x86/include/asm/cmdline.h b/arch/x86/include/asm/cmdline.h
index 6faaf27e8899..6cbd9ae58b21 100644
--- a/arch/x86/include/asm/cmdline.h
+++ b/arch/x86/include/asm/cmdline.h
@@ -2,6 +2,10 @@
#ifndef _ASM_X86_CMDLINE_H
#define _ASM_X86_CMDLINE_H
+#include <asm/setup.h>
+
+extern char builtin_cmdline[COMMAND_LINE_SIZE];
+
int cmdline_find_option_bool(const char *cmdline_ptr, const char *option);
int cmdline_find_option(const char *cmdline_ptr, const char *option,
char *buffer, int bufsize);
diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h
index b5731c51f0f4..62cef2113ca7 100644
--- a/arch/x86/include/asm/cmpxchg_32.h
+++ b/arch/x86/include/asm/cmpxchg_32.h
@@ -3,103 +3,148 @@
#define _ASM_X86_CMPXCHG_32_H
/*
- * Note: if you use set64_bit(), __cmpxchg64(), or their variants,
+ * Note: if you use __cmpxchg64(), or their variants,
* you need to test for the feature in boot_cpu_data.
*/
-#ifdef CONFIG_X86_CMPXCHG64
-#define arch_cmpxchg64(ptr, o, n) \
- ((__typeof__(*(ptr)))__cmpxchg64((ptr), (unsigned long long)(o), \
- (unsigned long long)(n)))
-#define arch_cmpxchg64_local(ptr, o, n) \
- ((__typeof__(*(ptr)))__cmpxchg64_local((ptr), (unsigned long long)(o), \
- (unsigned long long)(n)))
-#define arch_try_cmpxchg64(ptr, po, n) \
- __try_cmpxchg64((ptr), (unsigned long long *)(po), \
- (unsigned long long)(n))
-#endif
+union __u64_halves {
+ u64 full;
+ struct {
+ u32 low, high;
+ };
+};
+
+#define __arch_cmpxchg64(_ptr, _old, _new, _lock) \
+({ \
+ union __u64_halves o = { .full = (_old), }, \
+ n = { .full = (_new), }; \
+ \
+ asm volatile(_lock "cmpxchg8b %[ptr]" \
+ : [ptr] "+m" (*(_ptr)), \
+ "+a" (o.low), "+d" (o.high) \
+ : "b" (n.low), "c" (n.high) \
+ : "memory"); \
+ \
+ o.full; \
+})
+
+
+static __always_inline u64 __cmpxchg64(volatile u64 *ptr, u64 old, u64 new)
+{
+ return __arch_cmpxchg64(ptr, old, new, LOCK_PREFIX);
+}
-static inline u64 __cmpxchg64(volatile u64 *ptr, u64 old, u64 new)
+static __always_inline u64 __cmpxchg64_local(volatile u64 *ptr, u64 old, u64 new)
{
- u64 prev;
- asm volatile(LOCK_PREFIX "cmpxchg8b %1"
- : "=A" (prev),
- "+m" (*ptr)
- : "b" ((u32)new),
- "c" ((u32)(new >> 32)),
- "0" (old)
- : "memory");
- return prev;
+ return __arch_cmpxchg64(ptr, old, new,);
}
-static inline u64 __cmpxchg64_local(volatile u64 *ptr, u64 old, u64 new)
+#define __arch_try_cmpxchg64(_ptr, _oldp, _new, _lock) \
+({ \
+ union __u64_halves o = { .full = *(_oldp), }, \
+ n = { .full = (_new), }; \
+ bool ret; \
+ \
+ asm volatile(_lock "cmpxchg8b %[ptr]" \
+ CC_SET(e) \
+ : CC_OUT(e) (ret), \
+ [ptr] "+m" (*(_ptr)), \
+ "+a" (o.low), "+d" (o.high) \
+ : "b" (n.low), "c" (n.high) \
+ : "memory"); \
+ \
+ if (unlikely(!ret)) \
+ *(_oldp) = o.full; \
+ \
+ likely(ret); \
+})
+
+static __always_inline bool __try_cmpxchg64(volatile u64 *ptr, u64 *oldp, u64 new)
{
- u64 prev;
- asm volatile("cmpxchg8b %1"
- : "=A" (prev),
- "+m" (*ptr)
- : "b" ((u32)new),
- "c" ((u32)(new >> 32)),
- "0" (old)
- : "memory");
- return prev;
+ return __arch_try_cmpxchg64(ptr, oldp, new, LOCK_PREFIX);
}
-static inline bool __try_cmpxchg64(volatile u64 *ptr, u64 *pold, u64 new)
+static __always_inline bool __try_cmpxchg64_local(volatile u64 *ptr, u64 *oldp, u64 new)
{
- bool success;
- u64 old = *pold;
- asm volatile(LOCK_PREFIX "cmpxchg8b %[ptr]"
- CC_SET(z)
- : CC_OUT(z) (success),
- [ptr] "+m" (*ptr),
- "+A" (old)
- : "b" ((u32)new),
- "c" ((u32)(new >> 32))
- : "memory");
-
- if (unlikely(!success))
- *pold = old;
- return success;
+ return __arch_try_cmpxchg64(ptr, oldp, new,);
}
-#ifndef CONFIG_X86_CMPXCHG64
+#ifdef CONFIG_X86_CMPXCHG64
+
+#define arch_cmpxchg64 __cmpxchg64
+
+#define arch_cmpxchg64_local __cmpxchg64_local
+
+#define arch_try_cmpxchg64 __try_cmpxchg64
+
+#define arch_try_cmpxchg64_local __try_cmpxchg64_local
+
+#else
+
/*
* Building a kernel capable running on 80386 and 80486. It may be necessary
* to simulate the cmpxchg8b on the 80386 and 80486 CPU.
*/
-#define arch_cmpxchg64(ptr, o, n) \
-({ \
- __typeof__(*(ptr)) __ret; \
- __typeof__(*(ptr)) __old = (o); \
- __typeof__(*(ptr)) __new = (n); \
- alternative_io(LOCK_PREFIX_HERE \
- "call cmpxchg8b_emu", \
- "lock; cmpxchg8b (%%esi)" , \
- X86_FEATURE_CX8, \
- "=A" (__ret), \
- "S" ((ptr)), "0" (__old), \
- "b" ((unsigned int)__new), \
- "c" ((unsigned int)(__new>>32)) \
- : "memory"); \
- __ret; })
-
-
-#define arch_cmpxchg64_local(ptr, o, n) \
-({ \
- __typeof__(*(ptr)) __ret; \
- __typeof__(*(ptr)) __old = (o); \
- __typeof__(*(ptr)) __new = (n); \
- alternative_io("call cmpxchg8b_emu", \
- "cmpxchg8b (%%esi)" , \
- X86_FEATURE_CX8, \
- "=A" (__ret), \
- "S" ((ptr)), "0" (__old), \
- "b" ((unsigned int)__new), \
- "c" ((unsigned int)(__new>>32)) \
- : "memory"); \
- __ret; })
+#define __arch_cmpxchg64_emu(_ptr, _old, _new, _lock_loc, _lock) \
+({ \
+ union __u64_halves o = { .full = (_old), }, \
+ n = { .full = (_new), }; \
+ \
+ asm volatile(ALTERNATIVE(_lock_loc \
+ "call cmpxchg8b_emu", \
+ _lock "cmpxchg8b %a[ptr]", X86_FEATURE_CX8) \
+ : "+a" (o.low), "+d" (o.high) \
+ : "b" (n.low), "c" (n.high), [ptr] "S" (_ptr) \
+ : "memory"); \
+ \
+ o.full; \
+})
+
+static __always_inline u64 arch_cmpxchg64(volatile u64 *ptr, u64 old, u64 new)
+{
+ return __arch_cmpxchg64_emu(ptr, old, new, LOCK_PREFIX_HERE, "lock; ");
+}
+#define arch_cmpxchg64 arch_cmpxchg64
+
+static __always_inline u64 arch_cmpxchg64_local(volatile u64 *ptr, u64 old, u64 new)
+{
+ return __arch_cmpxchg64_emu(ptr, old, new, ,);
+}
+#define arch_cmpxchg64_local arch_cmpxchg64_local
+
+#define __arch_try_cmpxchg64_emu(_ptr, _oldp, _new, _lock_loc, _lock) \
+({ \
+ union __u64_halves o = { .full = *(_oldp), }, \
+ n = { .full = (_new), }; \
+ bool ret; \
+ \
+ asm volatile(ALTERNATIVE(_lock_loc \
+ "call cmpxchg8b_emu", \
+ _lock "cmpxchg8b %a[ptr]", X86_FEATURE_CX8) \
+ CC_SET(e) \
+ : CC_OUT(e) (ret), \
+ "+a" (o.low), "+d" (o.high) \
+ : "b" (n.low), "c" (n.high), [ptr] "S" (_ptr) \
+ : "memory"); \
+ \
+ if (unlikely(!ret)) \
+ *(_oldp) = o.full; \
+ \
+ likely(ret); \
+})
+
+static __always_inline bool arch_try_cmpxchg64(volatile u64 *ptr, u64 *oldp, u64 new)
+{
+ return __arch_try_cmpxchg64_emu(ptr, oldp, new, LOCK_PREFIX_HERE, "lock; ");
+}
+#define arch_try_cmpxchg64 arch_try_cmpxchg64
+
+static __always_inline bool arch_try_cmpxchg64_local(volatile u64 *ptr, u64 *oldp, u64 new)
+{
+ return __arch_try_cmpxchg64_emu(ptr, oldp, new, ,);
+}
+#define arch_try_cmpxchg64_local arch_try_cmpxchg64_local
#endif
diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h
index 44b08b53ab32..5e241306db26 100644
--- a/arch/x86/include/asm/cmpxchg_64.h
+++ b/arch/x86/include/asm/cmpxchg_64.h
@@ -20,6 +20,12 @@
arch_try_cmpxchg((ptr), (po), (n)); \
})
+#define arch_try_cmpxchg64_local(ptr, po, n) \
+({ \
+ BUILD_BUG_ON(sizeof(*(ptr)) != 8); \
+ arch_try_cmpxchg_local((ptr), (po), (n)); \
+})
+
union __u128_halves {
u128 full;
struct {
@@ -62,7 +68,7 @@ static __always_inline u128 arch_cmpxchg128_local(volatile u128 *ptr, u128 old,
asm volatile(_lock "cmpxchg16b %[ptr]" \
CC_SET(e) \
: CC_OUT(e) (ret), \
- [ptr] "+m" (*ptr), \
+ [ptr] "+m" (*(_ptr)), \
"+a" (o.low), "+d" (o.high) \
: "b" (n.low), "c" (n.high) \
: "memory"); \
diff --git a/arch/x86/include/asm/cpu_device_id.h b/arch/x86/include/asm/cpu_device_id.h
index eb8fcede9e3b..e4121d9aa9e1 100644
--- a/arch/x86/include/asm/cpu_device_id.h
+++ b/arch/x86/include/asm/cpu_device_id.h
@@ -3,6 +3,39 @@
#define _ASM_X86_CPU_DEVICE_ID
/*
+ * Can't use <linux/bitfield.h> because it generates expressions that
+ * cannot be used in structure initializers. Bitfield construction
+ * here must match the union in struct cpuinfo_86:
+ * union {
+ * struct {
+ * __u8 x86_model;
+ * __u8 x86;
+ * __u8 x86_vendor;
+ * __u8 x86_reserved;
+ * };
+ * __u32 x86_vfm;
+ * };
+ */
+#define VFM_MODEL_BIT 0
+#define VFM_FAMILY_BIT 8
+#define VFM_VENDOR_BIT 16
+#define VFM_RSVD_BIT 24
+
+#define VFM_MODEL_MASK GENMASK(VFM_FAMILY_BIT - 1, VFM_MODEL_BIT)
+#define VFM_FAMILY_MASK GENMASK(VFM_VENDOR_BIT - 1, VFM_FAMILY_BIT)
+#define VFM_VENDOR_MASK GENMASK(VFM_RSVD_BIT - 1, VFM_VENDOR_BIT)
+
+#define VFM_MODEL(vfm) (((vfm) & VFM_MODEL_MASK) >> VFM_MODEL_BIT)
+#define VFM_FAMILY(vfm) (((vfm) & VFM_FAMILY_MASK) >> VFM_FAMILY_BIT)
+#define VFM_VENDOR(vfm) (((vfm) & VFM_VENDOR_MASK) >> VFM_VENDOR_BIT)
+
+#define VFM_MAKE(_vendor, _family, _model) ( \
+ ((_model) << VFM_MODEL_BIT) | \
+ ((_family) << VFM_FAMILY_BIT) | \
+ ((_vendor) << VFM_VENDOR_BIT) \
+)
+
+/*
* Declare drivers belonging to specific x86 CPUs
* Similar in spirit to pci_device_id and related PCI functions
*
@@ -20,6 +53,9 @@
#define X86_CENTAUR_FAM6_C7_D 0xd
#define X86_CENTAUR_FAM6_NANO 0xf
+/* x86_cpu_id::flags */
+#define X86_CPU_ID_FLAG_ENTRY_VALID BIT(0)
+
#define X86_STEPPINGS(mins, maxs) GENMASK(maxs, mins)
/**
* X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE - Base macro for CPU matching
@@ -46,6 +82,18 @@
.model = _model, \
.steppings = _steppings, \
.feature = _feature, \
+ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, \
+ .driver_data = (unsigned long) _data \
+}
+
+#define X86_MATCH_VENDORID_FAM_MODEL_STEPPINGS_FEATURE(_vendor, _family, _model, \
+ _steppings, _feature, _data) { \
+ .vendor = _vendor, \
+ .family = _family, \
+ .model = _model, \
+ .steppings = _steppings, \
+ .feature = _feature, \
+ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, \
.driver_data = (unsigned long) _data \
}
@@ -145,24 +193,54 @@
X86_MATCH_VENDOR_FAM_MODEL(vendor, family, X86_MODEL_ANY, data)
/**
- * X86_MATCH_INTEL_FAM6_MODEL - Match vendor INTEL, family 6 and model
- * @model: The model name without the INTEL_FAM6_ prefix or ANY
- * The model name is expanded to INTEL_FAM6_@model internally
+ * X86_MATCH_VFM - Match encoded vendor/family/model
+ * @vfm: Encoded 8-bits each for vendor, family, model
* @data: Driver specific data or NULL. The internal storage
* format is unsigned long. The supplied value, pointer
- * etc. is casted to unsigned long internally.
+ * etc. is cast to unsigned long internally.
*
- * The vendor is set to INTEL, the family to 6 and all other missing
- * arguments of X86_MATCH_VENDOR_FAM_MODEL_FEATURE() are set to wildcards.
+ * Stepping and feature are set to wildcards
+ */
+#define X86_MATCH_VFM(vfm, data) \
+ X86_MATCH_VENDORID_FAM_MODEL_STEPPINGS_FEATURE( \
+ VFM_VENDOR(vfm), \
+ VFM_FAMILY(vfm), \
+ VFM_MODEL(vfm), \
+ X86_STEPPING_ANY, X86_FEATURE_ANY, data)
+
+/**
+ * X86_MATCH_VFM_STEPPINGS - Match encoded vendor/family/model/stepping
+ * @vfm: Encoded 8-bits each for vendor, family, model
+ * @steppings: Bitmask of steppings to match
+ * @data: Driver specific data or NULL. The internal storage
+ * format is unsigned long. The supplied value, pointer
+ * etc. is cast to unsigned long internally.
*
- * See X86_MATCH_VENDOR_FAM_MODEL_FEATURE() for further information.
+ * feature is set to wildcard
*/
-#define X86_MATCH_INTEL_FAM6_MODEL(model, data) \
- X86_MATCH_VENDOR_FAM_MODEL(INTEL, 6, INTEL_FAM6_##model, data)
+#define X86_MATCH_VFM_STEPPINGS(vfm, steppings, data) \
+ X86_MATCH_VENDORID_FAM_MODEL_STEPPINGS_FEATURE( \
+ VFM_VENDOR(vfm), \
+ VFM_FAMILY(vfm), \
+ VFM_MODEL(vfm), \
+ steppings, X86_FEATURE_ANY, data)
-#define X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(model, steppings, data) \
- X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(INTEL, 6, INTEL_FAM6_##model, \
- steppings, X86_FEATURE_ANY, data)
+/**
+ * X86_MATCH_VFM_FEATURE - Match encoded vendor/family/model/feature
+ * @vfm: Encoded 8-bits each for vendor, family, model
+ * @feature: A X86_FEATURE bit
+ * @data: Driver specific data or NULL. The internal storage
+ * format is unsigned long. The supplied value, pointer
+ * etc. is cast to unsigned long internally.
+ *
+ * Steppings is set to wildcard
+ */
+#define X86_MATCH_VFM_FEATURE(vfm, feature, data) \
+ X86_MATCH_VENDORID_FAM_MODEL_STEPPINGS_FEATURE( \
+ VFM_VENDOR(vfm), \
+ VFM_FAMILY(vfm), \
+ VFM_MODEL(vfm), \
+ X86_STEPPING_ANY, feature, data)
/*
* Match specific microcode revisions.
@@ -182,9 +260,17 @@ struct x86_cpu_desc {
u32 x86_microcode_rev;
};
-#define INTEL_CPU_DESC(model, stepping, revision) { \
- .x86_family = 6, \
- .x86_vendor = X86_VENDOR_INTEL, \
+#define INTEL_CPU_DESC(vfm, stepping, revision) { \
+ .x86_family = VFM_FAMILY(vfm), \
+ .x86_vendor = VFM_VENDOR(vfm), \
+ .x86_model = VFM_MODEL(vfm), \
+ .x86_stepping = (stepping), \
+ .x86_microcode_rev = (revision), \
+}
+
+#define AMD_CPU_DESC(fam, model, stepping, revision) { \
+ .x86_family = (fam), \
+ .x86_vendor = X86_VENDOR_AMD, \
.x86_model = (model), \
.x86_stepping = (stepping), \
.x86_microcode_rev = (revision), \
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 686e92d2663e..0b9611da6c53 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -129,8 +129,7 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
#define this_cpu_has(bit) \
(__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \
- x86_this_cpu_test_bit(bit, \
- (unsigned long __percpu *)&cpu_info.x86_capability))
+ x86_this_cpu_test_bit(bit, cpu_info.x86_capability))
/*
* This macro is for detection of features which need kernel
@@ -150,8 +149,12 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
extern void setup_clear_cpu_cap(unsigned int bit);
extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit);
-#define setup_force_cpu_cap(bit) do { \
- set_cpu_cap(&boot_cpu_data, bit); \
+#define setup_force_cpu_cap(bit) do { \
+ \
+ if (!boot_cpu_has(bit)) \
+ WARN_ON(alternatives_patched); \
+ \
+ set_cpu_cap(&boot_cpu_data, bit); \
set_bit(bit, (unsigned long *)cpu_caps_set); \
} while (0)
@@ -172,11 +175,10 @@ extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit);
*/
static __always_inline bool _static_cpu_has(u16 bit)
{
- asm goto(
- ALTERNATIVE_TERNARY("jmp 6f", %P[feature], "", "jmp %l[t_no]")
+ asm goto(ALTERNATIVE_TERNARY("jmp 6f", %c[feature], "", "jmp %l[t_no]")
".pushsection .altinstr_aux,\"ax\"\n"
"6:\n"
- " testb %[bitnum]," _ASM_RIP(%P[cap_byte]) "\n"
+ " testb %[bitnum], %a[cap_byte]\n"
" jnz %l[t_yes]\n"
" jmp %l[t_no]\n"
".popsection\n"
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 3c7434329661..dd4682857c12 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -18,170 +18,170 @@
/*
* Note: If the comment begins with a quoted string, that string is used
- * in /proc/cpuinfo instead of the macro name. If the string is "",
- * this feature bit is not displayed in /proc/cpuinfo at all.
+ * in /proc/cpuinfo instead of the macro name. Otherwise, this feature
+ * bit is not displayed in /proc/cpuinfo at all.
*
* When adding new features here that depend on other features,
* please update the table in kernel/cpu/cpuid-deps.c as well.
*/
/* Intel-defined CPU features, CPUID level 0x00000001 (EDX), word 0 */
-#define X86_FEATURE_FPU ( 0*32+ 0) /* Onboard FPU */
-#define X86_FEATURE_VME ( 0*32+ 1) /* Virtual Mode Extensions */
-#define X86_FEATURE_DE ( 0*32+ 2) /* Debugging Extensions */
-#define X86_FEATURE_PSE ( 0*32+ 3) /* Page Size Extensions */
-#define X86_FEATURE_TSC ( 0*32+ 4) /* Time Stamp Counter */
-#define X86_FEATURE_MSR ( 0*32+ 5) /* Model-Specific Registers */
-#define X86_FEATURE_PAE ( 0*32+ 6) /* Physical Address Extensions */
-#define X86_FEATURE_MCE ( 0*32+ 7) /* Machine Check Exception */
-#define X86_FEATURE_CX8 ( 0*32+ 8) /* CMPXCHG8 instruction */
-#define X86_FEATURE_APIC ( 0*32+ 9) /* Onboard APIC */
-#define X86_FEATURE_SEP ( 0*32+11) /* SYSENTER/SYSEXIT */
-#define X86_FEATURE_MTRR ( 0*32+12) /* Memory Type Range Registers */
-#define X86_FEATURE_PGE ( 0*32+13) /* Page Global Enable */
-#define X86_FEATURE_MCA ( 0*32+14) /* Machine Check Architecture */
-#define X86_FEATURE_CMOV ( 0*32+15) /* CMOV instructions (plus FCMOVcc, FCOMI with FPU) */
-#define X86_FEATURE_PAT ( 0*32+16) /* Page Attribute Table */
-#define X86_FEATURE_PSE36 ( 0*32+17) /* 36-bit PSEs */
-#define X86_FEATURE_PN ( 0*32+18) /* Processor serial number */
-#define X86_FEATURE_CLFLUSH ( 0*32+19) /* CLFLUSH instruction */
+#define X86_FEATURE_FPU ( 0*32+ 0) /* "fpu" Onboard FPU */
+#define X86_FEATURE_VME ( 0*32+ 1) /* "vme" Virtual Mode Extensions */
+#define X86_FEATURE_DE ( 0*32+ 2) /* "de" Debugging Extensions */
+#define X86_FEATURE_PSE ( 0*32+ 3) /* "pse" Page Size Extensions */
+#define X86_FEATURE_TSC ( 0*32+ 4) /* "tsc" Time Stamp Counter */
+#define X86_FEATURE_MSR ( 0*32+ 5) /* "msr" Model-Specific Registers */
+#define X86_FEATURE_PAE ( 0*32+ 6) /* "pae" Physical Address Extensions */
+#define X86_FEATURE_MCE ( 0*32+ 7) /* "mce" Machine Check Exception */
+#define X86_FEATURE_CX8 ( 0*32+ 8) /* "cx8" CMPXCHG8 instruction */
+#define X86_FEATURE_APIC ( 0*32+ 9) /* "apic" Onboard APIC */
+#define X86_FEATURE_SEP ( 0*32+11) /* "sep" SYSENTER/SYSEXIT */
+#define X86_FEATURE_MTRR ( 0*32+12) /* "mtrr" Memory Type Range Registers */
+#define X86_FEATURE_PGE ( 0*32+13) /* "pge" Page Global Enable */
+#define X86_FEATURE_MCA ( 0*32+14) /* "mca" Machine Check Architecture */
+#define X86_FEATURE_CMOV ( 0*32+15) /* "cmov" CMOV instructions (plus FCMOVcc, FCOMI with FPU) */
+#define X86_FEATURE_PAT ( 0*32+16) /* "pat" Page Attribute Table */
+#define X86_FEATURE_PSE36 ( 0*32+17) /* "pse36" 36-bit PSEs */
+#define X86_FEATURE_PN ( 0*32+18) /* "pn" Processor serial number */
+#define X86_FEATURE_CLFLUSH ( 0*32+19) /* "clflush" CLFLUSH instruction */
#define X86_FEATURE_DS ( 0*32+21) /* "dts" Debug Store */
-#define X86_FEATURE_ACPI ( 0*32+22) /* ACPI via MSR */
-#define X86_FEATURE_MMX ( 0*32+23) /* Multimedia Extensions */
-#define X86_FEATURE_FXSR ( 0*32+24) /* FXSAVE/FXRSTOR, CR4.OSFXSR */
+#define X86_FEATURE_ACPI ( 0*32+22) /* "acpi" ACPI via MSR */
+#define X86_FEATURE_MMX ( 0*32+23) /* "mmx" Multimedia Extensions */
+#define X86_FEATURE_FXSR ( 0*32+24) /* "fxsr" FXSAVE/FXRSTOR, CR4.OSFXSR */
#define X86_FEATURE_XMM ( 0*32+25) /* "sse" */
#define X86_FEATURE_XMM2 ( 0*32+26) /* "sse2" */
#define X86_FEATURE_SELFSNOOP ( 0*32+27) /* "ss" CPU self snoop */
-#define X86_FEATURE_HT ( 0*32+28) /* Hyper-Threading */
+#define X86_FEATURE_HT ( 0*32+28) /* "ht" Hyper-Threading */
#define X86_FEATURE_ACC ( 0*32+29) /* "tm" Automatic clock control */
-#define X86_FEATURE_IA64 ( 0*32+30) /* IA-64 processor */
-#define X86_FEATURE_PBE ( 0*32+31) /* Pending Break Enable */
+#define X86_FEATURE_IA64 ( 0*32+30) /* "ia64" IA-64 processor */
+#define X86_FEATURE_PBE ( 0*32+31) /* "pbe" Pending Break Enable */
/* AMD-defined CPU features, CPUID level 0x80000001, word 1 */
/* Don't duplicate feature flags which are redundant with Intel! */
-#define X86_FEATURE_SYSCALL ( 1*32+11) /* SYSCALL/SYSRET */
-#define X86_FEATURE_MP ( 1*32+19) /* MP Capable */
-#define X86_FEATURE_NX ( 1*32+20) /* Execute Disable */
-#define X86_FEATURE_MMXEXT ( 1*32+22) /* AMD MMX extensions */
-#define X86_FEATURE_FXSR_OPT ( 1*32+25) /* FXSAVE/FXRSTOR optimizations */
+#define X86_FEATURE_SYSCALL ( 1*32+11) /* "syscall" SYSCALL/SYSRET */
+#define X86_FEATURE_MP ( 1*32+19) /* "mp" MP Capable */
+#define X86_FEATURE_NX ( 1*32+20) /* "nx" Execute Disable */
+#define X86_FEATURE_MMXEXT ( 1*32+22) /* "mmxext" AMD MMX extensions */
+#define X86_FEATURE_FXSR_OPT ( 1*32+25) /* "fxsr_opt" FXSAVE/FXRSTOR optimizations */
#define X86_FEATURE_GBPAGES ( 1*32+26) /* "pdpe1gb" GB pages */
-#define X86_FEATURE_RDTSCP ( 1*32+27) /* RDTSCP */
-#define X86_FEATURE_LM ( 1*32+29) /* Long Mode (x86-64, 64-bit support) */
-#define X86_FEATURE_3DNOWEXT ( 1*32+30) /* AMD 3DNow extensions */
-#define X86_FEATURE_3DNOW ( 1*32+31) /* 3DNow */
+#define X86_FEATURE_RDTSCP ( 1*32+27) /* "rdtscp" RDTSCP */
+#define X86_FEATURE_LM ( 1*32+29) /* "lm" Long Mode (x86-64, 64-bit support) */
+#define X86_FEATURE_3DNOWEXT ( 1*32+30) /* "3dnowext" AMD 3DNow extensions */
+#define X86_FEATURE_3DNOW ( 1*32+31) /* "3dnow" 3DNow */
/* Transmeta-defined CPU features, CPUID level 0x80860001, word 2 */
-#define X86_FEATURE_RECOVERY ( 2*32+ 0) /* CPU in recovery mode */
-#define X86_FEATURE_LONGRUN ( 2*32+ 1) /* Longrun power control */
-#define X86_FEATURE_LRTI ( 2*32+ 3) /* LongRun table interface */
+#define X86_FEATURE_RECOVERY ( 2*32+ 0) /* "recovery" CPU in recovery mode */
+#define X86_FEATURE_LONGRUN ( 2*32+ 1) /* "longrun" Longrun power control */
+#define X86_FEATURE_LRTI ( 2*32+ 3) /* "lrti" LongRun table interface */
/* Other features, Linux-defined mapping, word 3 */
/* This range is used for feature bits which conflict or are synthesized */
-#define X86_FEATURE_CXMMX ( 3*32+ 0) /* Cyrix MMX extensions */
-#define X86_FEATURE_K6_MTRR ( 3*32+ 1) /* AMD K6 nonstandard MTRRs */
-#define X86_FEATURE_CYRIX_ARR ( 3*32+ 2) /* Cyrix ARRs (= MTRRs) */
-#define X86_FEATURE_CENTAUR_MCR ( 3*32+ 3) /* Centaur MCRs (= MTRRs) */
-#define X86_FEATURE_K8 ( 3*32+ 4) /* "" Opteron, Athlon64 */
-#define X86_FEATURE_ZEN5 ( 3*32+ 5) /* "" CPU based on Zen5 microarchitecture */
-#define X86_FEATURE_P3 ( 3*32+ 6) /* "" P3 */
-#define X86_FEATURE_P4 ( 3*32+ 7) /* "" P4 */
-#define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */
-#define X86_FEATURE_UP ( 3*32+ 9) /* SMP kernel running on UP */
-#define X86_FEATURE_ART ( 3*32+10) /* Always running timer (ART) */
-#define X86_FEATURE_ARCH_PERFMON ( 3*32+11) /* Intel Architectural PerfMon */
-#define X86_FEATURE_PEBS ( 3*32+12) /* Precise-Event Based Sampling */
-#define X86_FEATURE_BTS ( 3*32+13) /* Branch Trace Store */
-#define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in IA32 userspace */
-#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in IA32 userspace */
-#define X86_FEATURE_REP_GOOD ( 3*32+16) /* REP microcode works well */
-#define X86_FEATURE_AMD_LBR_V2 ( 3*32+17) /* AMD Last Branch Record Extension Version 2 */
-#define X86_FEATURE_CLEAR_CPU_BUF ( 3*32+18) /* "" Clear CPU buffers using VERW */
-#define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */
-#define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */
-#define X86_FEATURE_ALWAYS ( 3*32+21) /* "" Always-present feature */
-#define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* CPU topology enum extensions */
-#define X86_FEATURE_TSC_RELIABLE ( 3*32+23) /* TSC is known to be reliable */
-#define X86_FEATURE_NONSTOP_TSC ( 3*32+24) /* TSC does not stop in C states */
-#define X86_FEATURE_CPUID ( 3*32+25) /* CPU has CPUID instruction itself */
-#define X86_FEATURE_EXTD_APICID ( 3*32+26) /* Extended APICID (8 bits) */
-#define X86_FEATURE_AMD_DCM ( 3*32+27) /* AMD multi-node processor */
-#define X86_FEATURE_APERFMPERF ( 3*32+28) /* P-State hardware coordination feedback capability (APERF/MPERF MSRs) */
-#define X86_FEATURE_RAPL ( 3*32+29) /* AMD/Hygon RAPL interface */
-#define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */
-#define X86_FEATURE_TSC_KNOWN_FREQ ( 3*32+31) /* TSC has known frequency */
+#define X86_FEATURE_CXMMX ( 3*32+ 0) /* "cxmmx" Cyrix MMX extensions */
+#define X86_FEATURE_K6_MTRR ( 3*32+ 1) /* "k6_mtrr" AMD K6 nonstandard MTRRs */
+#define X86_FEATURE_CYRIX_ARR ( 3*32+ 2) /* "cyrix_arr" Cyrix ARRs (= MTRRs) */
+#define X86_FEATURE_CENTAUR_MCR ( 3*32+ 3) /* "centaur_mcr" Centaur MCRs (= MTRRs) */
+#define X86_FEATURE_K8 ( 3*32+ 4) /* Opteron, Athlon64 */
+#define X86_FEATURE_ZEN5 ( 3*32+ 5) /* CPU based on Zen5 microarchitecture */
+#define X86_FEATURE_P3 ( 3*32+ 6) /* P3 */
+#define X86_FEATURE_P4 ( 3*32+ 7) /* P4 */
+#define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* "constant_tsc" TSC ticks at a constant rate */
+#define X86_FEATURE_UP ( 3*32+ 9) /* "up" SMP kernel running on UP */
+#define X86_FEATURE_ART ( 3*32+10) /* "art" Always running timer (ART) */
+#define X86_FEATURE_ARCH_PERFMON ( 3*32+11) /* "arch_perfmon" Intel Architectural PerfMon */
+#define X86_FEATURE_PEBS ( 3*32+12) /* "pebs" Precise-Event Based Sampling */
+#define X86_FEATURE_BTS ( 3*32+13) /* "bts" Branch Trace Store */
+#define X86_FEATURE_SYSCALL32 ( 3*32+14) /* syscall in IA32 userspace */
+#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* sysenter in IA32 userspace */
+#define X86_FEATURE_REP_GOOD ( 3*32+16) /* "rep_good" REP microcode works well */
+#define X86_FEATURE_AMD_LBR_V2 ( 3*32+17) /* "amd_lbr_v2" AMD Last Branch Record Extension Version 2 */
+#define X86_FEATURE_CLEAR_CPU_BUF ( 3*32+18) /* Clear CPU buffers using VERW */
+#define X86_FEATURE_ACC_POWER ( 3*32+19) /* "acc_power" AMD Accumulated Power Mechanism */
+#define X86_FEATURE_NOPL ( 3*32+20) /* "nopl" The NOPL (0F 1F) instructions */
+#define X86_FEATURE_ALWAYS ( 3*32+21) /* Always-present feature */
+#define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* "xtopology" CPU topology enum extensions */
+#define X86_FEATURE_TSC_RELIABLE ( 3*32+23) /* "tsc_reliable" TSC is known to be reliable */
+#define X86_FEATURE_NONSTOP_TSC ( 3*32+24) /* "nonstop_tsc" TSC does not stop in C states */
+#define X86_FEATURE_CPUID ( 3*32+25) /* "cpuid" CPU has CPUID instruction itself */
+#define X86_FEATURE_EXTD_APICID ( 3*32+26) /* "extd_apicid" Extended APICID (8 bits) */
+#define X86_FEATURE_AMD_DCM ( 3*32+27) /* "amd_dcm" AMD multi-node processor */
+#define X86_FEATURE_APERFMPERF ( 3*32+28) /* "aperfmperf" P-State hardware coordination feedback capability (APERF/MPERF MSRs) */
+#define X86_FEATURE_RAPL ( 3*32+29) /* "rapl" AMD/Hygon RAPL interface */
+#define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* "nonstop_tsc_s3" TSC doesn't stop in S3 state */
+#define X86_FEATURE_TSC_KNOWN_FREQ ( 3*32+31) /* "tsc_known_freq" TSC has known frequency */
/* Intel-defined CPU features, CPUID level 0x00000001 (ECX), word 4 */
#define X86_FEATURE_XMM3 ( 4*32+ 0) /* "pni" SSE-3 */
-#define X86_FEATURE_PCLMULQDQ ( 4*32+ 1) /* PCLMULQDQ instruction */
-#define X86_FEATURE_DTES64 ( 4*32+ 2) /* 64-bit Debug Store */
+#define X86_FEATURE_PCLMULQDQ ( 4*32+ 1) /* "pclmulqdq" PCLMULQDQ instruction */
+#define X86_FEATURE_DTES64 ( 4*32+ 2) /* "dtes64" 64-bit Debug Store */
#define X86_FEATURE_MWAIT ( 4*32+ 3) /* "monitor" MONITOR/MWAIT support */
#define X86_FEATURE_DSCPL ( 4*32+ 4) /* "ds_cpl" CPL-qualified (filtered) Debug Store */
-#define X86_FEATURE_VMX ( 4*32+ 5) /* Hardware virtualization */
-#define X86_FEATURE_SMX ( 4*32+ 6) /* Safer Mode eXtensions */
-#define X86_FEATURE_EST ( 4*32+ 7) /* Enhanced SpeedStep */
-#define X86_FEATURE_TM2 ( 4*32+ 8) /* Thermal Monitor 2 */
-#define X86_FEATURE_SSSE3 ( 4*32+ 9) /* Supplemental SSE-3 */
-#define X86_FEATURE_CID ( 4*32+10) /* Context ID */
-#define X86_FEATURE_SDBG ( 4*32+11) /* Silicon Debug */
-#define X86_FEATURE_FMA ( 4*32+12) /* Fused multiply-add */
-#define X86_FEATURE_CX16 ( 4*32+13) /* CMPXCHG16B instruction */
-#define X86_FEATURE_XTPR ( 4*32+14) /* Send Task Priority Messages */
-#define X86_FEATURE_PDCM ( 4*32+15) /* Perf/Debug Capabilities MSR */
-#define X86_FEATURE_PCID ( 4*32+17) /* Process Context Identifiers */
-#define X86_FEATURE_DCA ( 4*32+18) /* Direct Cache Access */
+#define X86_FEATURE_VMX ( 4*32+ 5) /* "vmx" Hardware virtualization */
+#define X86_FEATURE_SMX ( 4*32+ 6) /* "smx" Safer Mode eXtensions */
+#define X86_FEATURE_EST ( 4*32+ 7) /* "est" Enhanced SpeedStep */
+#define X86_FEATURE_TM2 ( 4*32+ 8) /* "tm2" Thermal Monitor 2 */
+#define X86_FEATURE_SSSE3 ( 4*32+ 9) /* "ssse3" Supplemental SSE-3 */
+#define X86_FEATURE_CID ( 4*32+10) /* "cid" Context ID */
+#define X86_FEATURE_SDBG ( 4*32+11) /* "sdbg" Silicon Debug */
+#define X86_FEATURE_FMA ( 4*32+12) /* "fma" Fused multiply-add */
+#define X86_FEATURE_CX16 ( 4*32+13) /* "cx16" CMPXCHG16B instruction */
+#define X86_FEATURE_XTPR ( 4*32+14) /* "xtpr" Send Task Priority Messages */
+#define X86_FEATURE_PDCM ( 4*32+15) /* "pdcm" Perf/Debug Capabilities MSR */
+#define X86_FEATURE_PCID ( 4*32+17) /* "pcid" Process Context Identifiers */
+#define X86_FEATURE_DCA ( 4*32+18) /* "dca" Direct Cache Access */
#define X86_FEATURE_XMM4_1 ( 4*32+19) /* "sse4_1" SSE-4.1 */
#define X86_FEATURE_XMM4_2 ( 4*32+20) /* "sse4_2" SSE-4.2 */
-#define X86_FEATURE_X2APIC ( 4*32+21) /* X2APIC */
-#define X86_FEATURE_MOVBE ( 4*32+22) /* MOVBE instruction */
-#define X86_FEATURE_POPCNT ( 4*32+23) /* POPCNT instruction */
-#define X86_FEATURE_TSC_DEADLINE_TIMER ( 4*32+24) /* TSC deadline timer */
-#define X86_FEATURE_AES ( 4*32+25) /* AES instructions */
-#define X86_FEATURE_XSAVE ( 4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV instructions */
-#define X86_FEATURE_OSXSAVE ( 4*32+27) /* "" XSAVE instruction enabled in the OS */
-#define X86_FEATURE_AVX ( 4*32+28) /* Advanced Vector Extensions */
-#define X86_FEATURE_F16C ( 4*32+29) /* 16-bit FP conversions */
-#define X86_FEATURE_RDRAND ( 4*32+30) /* RDRAND instruction */
-#define X86_FEATURE_HYPERVISOR ( 4*32+31) /* Running on a hypervisor */
+#define X86_FEATURE_X2APIC ( 4*32+21) /* "x2apic" X2APIC */
+#define X86_FEATURE_MOVBE ( 4*32+22) /* "movbe" MOVBE instruction */
+#define X86_FEATURE_POPCNT ( 4*32+23) /* "popcnt" POPCNT instruction */
+#define X86_FEATURE_TSC_DEADLINE_TIMER ( 4*32+24) /* "tsc_deadline_timer" TSC deadline timer */
+#define X86_FEATURE_AES ( 4*32+25) /* "aes" AES instructions */
+#define X86_FEATURE_XSAVE ( 4*32+26) /* "xsave" XSAVE/XRSTOR/XSETBV/XGETBV instructions */
+#define X86_FEATURE_OSXSAVE ( 4*32+27) /* XSAVE instruction enabled in the OS */
+#define X86_FEATURE_AVX ( 4*32+28) /* "avx" Advanced Vector Extensions */
+#define X86_FEATURE_F16C ( 4*32+29) /* "f16c" 16-bit FP conversions */
+#define X86_FEATURE_RDRAND ( 4*32+30) /* "rdrand" RDRAND instruction */
+#define X86_FEATURE_HYPERVISOR ( 4*32+31) /* "hypervisor" Running on a hypervisor */
/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */
#define X86_FEATURE_XSTORE ( 5*32+ 2) /* "rng" RNG present (xstore) */
#define X86_FEATURE_XSTORE_EN ( 5*32+ 3) /* "rng_en" RNG enabled */
#define X86_FEATURE_XCRYPT ( 5*32+ 6) /* "ace" on-CPU crypto (xcrypt) */
#define X86_FEATURE_XCRYPT_EN ( 5*32+ 7) /* "ace_en" on-CPU crypto enabled */
-#define X86_FEATURE_ACE2 ( 5*32+ 8) /* Advanced Cryptography Engine v2 */
-#define X86_FEATURE_ACE2_EN ( 5*32+ 9) /* ACE v2 enabled */
-#define X86_FEATURE_PHE ( 5*32+10) /* PadLock Hash Engine */
-#define X86_FEATURE_PHE_EN ( 5*32+11) /* PHE enabled */
-#define X86_FEATURE_PMM ( 5*32+12) /* PadLock Montgomery Multiplier */
-#define X86_FEATURE_PMM_EN ( 5*32+13) /* PMM enabled */
+#define X86_FEATURE_ACE2 ( 5*32+ 8) /* "ace2" Advanced Cryptography Engine v2 */
+#define X86_FEATURE_ACE2_EN ( 5*32+ 9) /* "ace2_en" ACE v2 enabled */
+#define X86_FEATURE_PHE ( 5*32+10) /* "phe" PadLock Hash Engine */
+#define X86_FEATURE_PHE_EN ( 5*32+11) /* "phe_en" PHE enabled */
+#define X86_FEATURE_PMM ( 5*32+12) /* "pmm" PadLock Montgomery Multiplier */
+#define X86_FEATURE_PMM_EN ( 5*32+13) /* "pmm_en" PMM enabled */
/* More extended AMD flags: CPUID level 0x80000001, ECX, word 6 */
-#define X86_FEATURE_LAHF_LM ( 6*32+ 0) /* LAHF/SAHF in long mode */
-#define X86_FEATURE_CMP_LEGACY ( 6*32+ 1) /* If yes HyperThreading not valid */
-#define X86_FEATURE_SVM ( 6*32+ 2) /* Secure Virtual Machine */
-#define X86_FEATURE_EXTAPIC ( 6*32+ 3) /* Extended APIC space */
-#define X86_FEATURE_CR8_LEGACY ( 6*32+ 4) /* CR8 in 32-bit mode */
-#define X86_FEATURE_ABM ( 6*32+ 5) /* Advanced bit manipulation */
-#define X86_FEATURE_SSE4A ( 6*32+ 6) /* SSE-4A */
-#define X86_FEATURE_MISALIGNSSE ( 6*32+ 7) /* Misaligned SSE mode */
-#define X86_FEATURE_3DNOWPREFETCH ( 6*32+ 8) /* 3DNow prefetch instructions */
-#define X86_FEATURE_OSVW ( 6*32+ 9) /* OS Visible Workaround */
-#define X86_FEATURE_IBS ( 6*32+10) /* Instruction Based Sampling */
-#define X86_FEATURE_XOP ( 6*32+11) /* extended AVX instructions */
-#define X86_FEATURE_SKINIT ( 6*32+12) /* SKINIT/STGI instructions */
-#define X86_FEATURE_WDT ( 6*32+13) /* Watchdog timer */
-#define X86_FEATURE_LWP ( 6*32+15) /* Light Weight Profiling */
-#define X86_FEATURE_FMA4 ( 6*32+16) /* 4 operands MAC instructions */
-#define X86_FEATURE_TCE ( 6*32+17) /* Translation Cache Extension */
-#define X86_FEATURE_NODEID_MSR ( 6*32+19) /* NodeId MSR */
-#define X86_FEATURE_TBM ( 6*32+21) /* Trailing Bit Manipulations */
-#define X86_FEATURE_TOPOEXT ( 6*32+22) /* Topology extensions CPUID leafs */
-#define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* Core performance counter extensions */
-#define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */
-#define X86_FEATURE_BPEXT ( 6*32+26) /* Data breakpoint extension */
-#define X86_FEATURE_PTSC ( 6*32+27) /* Performance time-stamp counter */
-#define X86_FEATURE_PERFCTR_LLC ( 6*32+28) /* Last Level Cache performance counter extensions */
-#define X86_FEATURE_MWAITX ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX instructions) */
+#define X86_FEATURE_LAHF_LM ( 6*32+ 0) /* "lahf_lm" LAHF/SAHF in long mode */
+#define X86_FEATURE_CMP_LEGACY ( 6*32+ 1) /* "cmp_legacy" If yes HyperThreading not valid */
+#define X86_FEATURE_SVM ( 6*32+ 2) /* "svm" Secure Virtual Machine */
+#define X86_FEATURE_EXTAPIC ( 6*32+ 3) /* "extapic" Extended APIC space */
+#define X86_FEATURE_CR8_LEGACY ( 6*32+ 4) /* "cr8_legacy" CR8 in 32-bit mode */
+#define X86_FEATURE_ABM ( 6*32+ 5) /* "abm" Advanced bit manipulation */
+#define X86_FEATURE_SSE4A ( 6*32+ 6) /* "sse4a" SSE-4A */
+#define X86_FEATURE_MISALIGNSSE ( 6*32+ 7) /* "misalignsse" Misaligned SSE mode */
+#define X86_FEATURE_3DNOWPREFETCH ( 6*32+ 8) /* "3dnowprefetch" 3DNow prefetch instructions */
+#define X86_FEATURE_OSVW ( 6*32+ 9) /* "osvw" OS Visible Workaround */
+#define X86_FEATURE_IBS ( 6*32+10) /* "ibs" Instruction Based Sampling */
+#define X86_FEATURE_XOP ( 6*32+11) /* "xop" Extended AVX instructions */
+#define X86_FEATURE_SKINIT ( 6*32+12) /* "skinit" SKINIT/STGI instructions */
+#define X86_FEATURE_WDT ( 6*32+13) /* "wdt" Watchdog timer */
+#define X86_FEATURE_LWP ( 6*32+15) /* "lwp" Light Weight Profiling */
+#define X86_FEATURE_FMA4 ( 6*32+16) /* "fma4" 4 operands MAC instructions */
+#define X86_FEATURE_TCE ( 6*32+17) /* "tce" Translation Cache Extension */
+#define X86_FEATURE_NODEID_MSR ( 6*32+19) /* "nodeid_msr" NodeId MSR */
+#define X86_FEATURE_TBM ( 6*32+21) /* "tbm" Trailing Bit Manipulations */
+#define X86_FEATURE_TOPOEXT ( 6*32+22) /* "topoext" Topology extensions CPUID leafs */
+#define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* "perfctr_core" Core performance counter extensions */
+#define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* "perfctr_nb" NB performance counter extensions */
+#define X86_FEATURE_BPEXT ( 6*32+26) /* "bpext" Data breakpoint extension */
+#define X86_FEATURE_PTSC ( 6*32+27) /* "ptsc" Performance time-stamp counter */
+#define X86_FEATURE_PERFCTR_LLC ( 6*32+28) /* "perfctr_llc" Last Level Cache performance counter extensions */
+#define X86_FEATURE_MWAITX ( 6*32+29) /* "mwaitx" MWAIT extension (MONITORX/MWAITX instructions) */
/*
* Auxiliary flags: Linux defined - For features scattered in various
@@ -189,93 +189,93 @@
*
* Reuse free bits when adding new feature flags!
*/
-#define X86_FEATURE_RING3MWAIT ( 7*32+ 0) /* Ring 3 MONITOR/MWAIT instructions */
-#define X86_FEATURE_CPUID_FAULT ( 7*32+ 1) /* Intel CPUID faulting */
-#define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */
-#define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */
-#define X86_FEATURE_CAT_L3 ( 7*32+ 4) /* Cache Allocation Technology L3 */
-#define X86_FEATURE_CAT_L2 ( 7*32+ 5) /* Cache Allocation Technology L2 */
-#define X86_FEATURE_CDP_L3 ( 7*32+ 6) /* Code and Data Prioritization L3 */
-#define X86_FEATURE_TDX_HOST_PLATFORM ( 7*32+ 7) /* Platform supports being a TDX host */
-#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */
-#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
-#define X86_FEATURE_XCOMPACTED ( 7*32+10) /* "" Use compacted XSTATE (XSAVES or XSAVEC) */
-#define X86_FEATURE_PTI ( 7*32+11) /* Kernel Page Table Isolation enabled */
-#define X86_FEATURE_KERNEL_IBRS ( 7*32+12) /* "" Set/clear IBRS on kernel entry/exit */
-#define X86_FEATURE_RSB_VMEXIT ( 7*32+13) /* "" Fill RSB on VM-Exit */
-#define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */
-#define X86_FEATURE_CDP_L2 ( 7*32+15) /* Code and Data Prioritization L2 */
-#define X86_FEATURE_MSR_SPEC_CTRL ( 7*32+16) /* "" MSR SPEC_CTRL is implemented */
-#define X86_FEATURE_SSBD ( 7*32+17) /* Speculative Store Bypass Disable */
-#define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */
-#define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* "" Fill RSB on context switches */
-#define X86_FEATURE_PERFMON_V2 ( 7*32+20) /* AMD Performance Monitoring Version 2 */
-#define X86_FEATURE_USE_IBPB ( 7*32+21) /* "" Indirect Branch Prediction Barrier enabled */
-#define X86_FEATURE_USE_IBRS_FW ( 7*32+22) /* "" Use IBRS during runtime firmware calls */
-#define X86_FEATURE_SPEC_STORE_BYPASS_DISABLE ( 7*32+23) /* "" Disable Speculative Store Bypass. */
-#define X86_FEATURE_LS_CFG_SSBD ( 7*32+24) /* "" AMD SSBD implementation via LS_CFG MSR */
-#define X86_FEATURE_IBRS ( 7*32+25) /* Indirect Branch Restricted Speculation */
-#define X86_FEATURE_IBPB ( 7*32+26) /* Indirect Branch Prediction Barrier */
-#define X86_FEATURE_STIBP ( 7*32+27) /* Single Thread Indirect Branch Predictors */
-#define X86_FEATURE_ZEN ( 7*32+28) /* "" Generic flag for all Zen and newer */
-#define X86_FEATURE_L1TF_PTEINV ( 7*32+29) /* "" L1TF workaround PTE inversion */
-#define X86_FEATURE_IBRS_ENHANCED ( 7*32+30) /* Enhanced IBRS */
-#define X86_FEATURE_MSR_IA32_FEAT_CTL ( 7*32+31) /* "" MSR IA32_FEAT_CTL configured */
+#define X86_FEATURE_RING3MWAIT ( 7*32+ 0) /* "ring3mwait" Ring 3 MONITOR/MWAIT instructions */
+#define X86_FEATURE_CPUID_FAULT ( 7*32+ 1) /* "cpuid_fault" Intel CPUID faulting */
+#define X86_FEATURE_CPB ( 7*32+ 2) /* "cpb" AMD Core Performance Boost */
+#define X86_FEATURE_EPB ( 7*32+ 3) /* "epb" IA32_ENERGY_PERF_BIAS support */
+#define X86_FEATURE_CAT_L3 ( 7*32+ 4) /* "cat_l3" Cache Allocation Technology L3 */
+#define X86_FEATURE_CAT_L2 ( 7*32+ 5) /* "cat_l2" Cache Allocation Technology L2 */
+#define X86_FEATURE_CDP_L3 ( 7*32+ 6) /* "cdp_l3" Code and Data Prioritization L3 */
+#define X86_FEATURE_TDX_HOST_PLATFORM ( 7*32+ 7) /* "tdx_host_platform" Platform supports being a TDX host */
+#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* "hw_pstate" AMD HW-PState */
+#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* "proc_feedback" AMD ProcFeedbackInterface */
+#define X86_FEATURE_XCOMPACTED ( 7*32+10) /* Use compacted XSTATE (XSAVES or XSAVEC) */
+#define X86_FEATURE_PTI ( 7*32+11) /* "pti" Kernel Page Table Isolation enabled */
+#define X86_FEATURE_KERNEL_IBRS ( 7*32+12) /* Set/clear IBRS on kernel entry/exit */
+#define X86_FEATURE_RSB_VMEXIT ( 7*32+13) /* Fill RSB on VM-Exit */
+#define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* "intel_ppin" Intel Processor Inventory Number */
+#define X86_FEATURE_CDP_L2 ( 7*32+15) /* "cdp_l2" Code and Data Prioritization L2 */
+#define X86_FEATURE_MSR_SPEC_CTRL ( 7*32+16) /* MSR SPEC_CTRL is implemented */
+#define X86_FEATURE_SSBD ( 7*32+17) /* "ssbd" Speculative Store Bypass Disable */
+#define X86_FEATURE_MBA ( 7*32+18) /* "mba" Memory Bandwidth Allocation */
+#define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* Fill RSB on context switches */
+#define X86_FEATURE_PERFMON_V2 ( 7*32+20) /* "perfmon_v2" AMD Performance Monitoring Version 2 */
+#define X86_FEATURE_USE_IBPB ( 7*32+21) /* Indirect Branch Prediction Barrier enabled */
+#define X86_FEATURE_USE_IBRS_FW ( 7*32+22) /* Use IBRS during runtime firmware calls */
+#define X86_FEATURE_SPEC_STORE_BYPASS_DISABLE ( 7*32+23) /* Disable Speculative Store Bypass. */
+#define X86_FEATURE_LS_CFG_SSBD ( 7*32+24) /* AMD SSBD implementation via LS_CFG MSR */
+#define X86_FEATURE_IBRS ( 7*32+25) /* "ibrs" Indirect Branch Restricted Speculation */
+#define X86_FEATURE_IBPB ( 7*32+26) /* "ibpb" Indirect Branch Prediction Barrier */
+#define X86_FEATURE_STIBP ( 7*32+27) /* "stibp" Single Thread Indirect Branch Predictors */
+#define X86_FEATURE_ZEN ( 7*32+28) /* Generic flag for all Zen and newer */
+#define X86_FEATURE_L1TF_PTEINV ( 7*32+29) /* L1TF workaround PTE inversion */
+#define X86_FEATURE_IBRS_ENHANCED ( 7*32+30) /* "ibrs_enhanced" Enhanced IBRS */
+#define X86_FEATURE_MSR_IA32_FEAT_CTL ( 7*32+31) /* MSR IA32_FEAT_CTL configured */
/* Virtualization flags: Linux defined, word 8 */
-#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */
-#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 1) /* Intel FlexPriority */
-#define X86_FEATURE_EPT ( 8*32+ 2) /* Intel Extended Page Table */
-#define X86_FEATURE_VPID ( 8*32+ 3) /* Intel Virtual Processor ID */
+#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* "tpr_shadow" Intel TPR Shadow */
+#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 1) /* "flexpriority" Intel FlexPriority */
+#define X86_FEATURE_EPT ( 8*32+ 2) /* "ept" Intel Extended Page Table */
+#define X86_FEATURE_VPID ( 8*32+ 3) /* "vpid" Intel Virtual Processor ID */
-#define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer VMMCALL to VMCALL */
-#define X86_FEATURE_XENPV ( 8*32+16) /* "" Xen paravirtual guest */
-#define X86_FEATURE_EPT_AD ( 8*32+17) /* Intel Extended Page Table access-dirty bit */
-#define X86_FEATURE_VMCALL ( 8*32+18) /* "" Hypervisor supports the VMCALL instruction */
-#define X86_FEATURE_VMW_VMMCALL ( 8*32+19) /* "" VMware prefers VMMCALL hypercall instruction */
-#define X86_FEATURE_PVUNLOCK ( 8*32+20) /* "" PV unlock function */
-#define X86_FEATURE_VCPUPREEMPT ( 8*32+21) /* "" PV vcpu_is_preempted function */
-#define X86_FEATURE_TDX_GUEST ( 8*32+22) /* Intel Trust Domain Extensions Guest */
+#define X86_FEATURE_VMMCALL ( 8*32+15) /* "vmmcall" Prefer VMMCALL to VMCALL */
+#define X86_FEATURE_XENPV ( 8*32+16) /* Xen paravirtual guest */
+#define X86_FEATURE_EPT_AD ( 8*32+17) /* "ept_ad" Intel Extended Page Table access-dirty bit */
+#define X86_FEATURE_VMCALL ( 8*32+18) /* Hypervisor supports the VMCALL instruction */
+#define X86_FEATURE_VMW_VMMCALL ( 8*32+19) /* VMware prefers VMMCALL hypercall instruction */
+#define X86_FEATURE_PVUNLOCK ( 8*32+20) /* PV unlock function */
+#define X86_FEATURE_VCPUPREEMPT ( 8*32+21) /* PV vcpu_is_preempted function */
+#define X86_FEATURE_TDX_GUEST ( 8*32+22) /* "tdx_guest" Intel Trust Domain Extensions Guest */
/* Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 */
-#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/
-#define X86_FEATURE_TSC_ADJUST ( 9*32+ 1) /* TSC adjustment MSR 0x3B */
-#define X86_FEATURE_SGX ( 9*32+ 2) /* Software Guard Extensions */
-#define X86_FEATURE_BMI1 ( 9*32+ 3) /* 1st group bit manipulation extensions */
-#define X86_FEATURE_HLE ( 9*32+ 4) /* Hardware Lock Elision */
-#define X86_FEATURE_AVX2 ( 9*32+ 5) /* AVX2 instructions */
-#define X86_FEATURE_FDP_EXCPTN_ONLY ( 9*32+ 6) /* "" FPU data pointer updated only on x87 exceptions */
-#define X86_FEATURE_SMEP ( 9*32+ 7) /* Supervisor Mode Execution Protection */
-#define X86_FEATURE_BMI2 ( 9*32+ 8) /* 2nd group bit manipulation extensions */
-#define X86_FEATURE_ERMS ( 9*32+ 9) /* Enhanced REP MOVSB/STOSB instructions */
-#define X86_FEATURE_INVPCID ( 9*32+10) /* Invalidate Processor Context ID */
-#define X86_FEATURE_RTM ( 9*32+11) /* Restricted Transactional Memory */
-#define X86_FEATURE_CQM ( 9*32+12) /* Cache QoS Monitoring */
-#define X86_FEATURE_ZERO_FCS_FDS ( 9*32+13) /* "" Zero out FPU CS and FPU DS */
-#define X86_FEATURE_MPX ( 9*32+14) /* Memory Protection Extension */
-#define X86_FEATURE_RDT_A ( 9*32+15) /* Resource Director Technology Allocation */
-#define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */
-#define X86_FEATURE_AVX512DQ ( 9*32+17) /* AVX-512 DQ (Double/Quad granular) Instructions */
-#define X86_FEATURE_RDSEED ( 9*32+18) /* RDSEED instruction */
-#define X86_FEATURE_ADX ( 9*32+19) /* ADCX and ADOX instructions */
-#define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */
-#define X86_FEATURE_AVX512IFMA ( 9*32+21) /* AVX-512 Integer Fused Multiply-Add instructions */
-#define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */
-#define X86_FEATURE_CLWB ( 9*32+24) /* CLWB instruction */
-#define X86_FEATURE_INTEL_PT ( 9*32+25) /* Intel Processor Trace */
-#define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */
-#define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */
-#define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */
-#define X86_FEATURE_SHA_NI ( 9*32+29) /* SHA1/SHA256 Instruction Extensions */
-#define X86_FEATURE_AVX512BW ( 9*32+30) /* AVX-512 BW (Byte/Word granular) Instructions */
-#define X86_FEATURE_AVX512VL ( 9*32+31) /* AVX-512 VL (128/256 Vector Length) Extensions */
+#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* "fsgsbase" RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/
+#define X86_FEATURE_TSC_ADJUST ( 9*32+ 1) /* "tsc_adjust" TSC adjustment MSR 0x3B */
+#define X86_FEATURE_SGX ( 9*32+ 2) /* "sgx" Software Guard Extensions */
+#define X86_FEATURE_BMI1 ( 9*32+ 3) /* "bmi1" 1st group bit manipulation extensions */
+#define X86_FEATURE_HLE ( 9*32+ 4) /* "hle" Hardware Lock Elision */
+#define X86_FEATURE_AVX2 ( 9*32+ 5) /* "avx2" AVX2 instructions */
+#define X86_FEATURE_FDP_EXCPTN_ONLY ( 9*32+ 6) /* FPU data pointer updated only on x87 exceptions */
+#define X86_FEATURE_SMEP ( 9*32+ 7) /* "smep" Supervisor Mode Execution Protection */
+#define X86_FEATURE_BMI2 ( 9*32+ 8) /* "bmi2" 2nd group bit manipulation extensions */
+#define X86_FEATURE_ERMS ( 9*32+ 9) /* "erms" Enhanced REP MOVSB/STOSB instructions */
+#define X86_FEATURE_INVPCID ( 9*32+10) /* "invpcid" Invalidate Processor Context ID */
+#define X86_FEATURE_RTM ( 9*32+11) /* "rtm" Restricted Transactional Memory */
+#define X86_FEATURE_CQM ( 9*32+12) /* "cqm" Cache QoS Monitoring */
+#define X86_FEATURE_ZERO_FCS_FDS ( 9*32+13) /* Zero out FPU CS and FPU DS */
+#define X86_FEATURE_MPX ( 9*32+14) /* "mpx" Memory Protection Extension */
+#define X86_FEATURE_RDT_A ( 9*32+15) /* "rdt_a" Resource Director Technology Allocation */
+#define X86_FEATURE_AVX512F ( 9*32+16) /* "avx512f" AVX-512 Foundation */
+#define X86_FEATURE_AVX512DQ ( 9*32+17) /* "avx512dq" AVX-512 DQ (Double/Quad granular) Instructions */
+#define X86_FEATURE_RDSEED ( 9*32+18) /* "rdseed" RDSEED instruction */
+#define X86_FEATURE_ADX ( 9*32+19) /* "adx" ADCX and ADOX instructions */
+#define X86_FEATURE_SMAP ( 9*32+20) /* "smap" Supervisor Mode Access Prevention */
+#define X86_FEATURE_AVX512IFMA ( 9*32+21) /* "avx512ifma" AVX-512 Integer Fused Multiply-Add instructions */
+#define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* "clflushopt" CLFLUSHOPT instruction */
+#define X86_FEATURE_CLWB ( 9*32+24) /* "clwb" CLWB instruction */
+#define X86_FEATURE_INTEL_PT ( 9*32+25) /* "intel_pt" Intel Processor Trace */
+#define X86_FEATURE_AVX512PF ( 9*32+26) /* "avx512pf" AVX-512 Prefetch */
+#define X86_FEATURE_AVX512ER ( 9*32+27) /* "avx512er" AVX-512 Exponential and Reciprocal */
+#define X86_FEATURE_AVX512CD ( 9*32+28) /* "avx512cd" AVX-512 Conflict Detection */
+#define X86_FEATURE_SHA_NI ( 9*32+29) /* "sha_ni" SHA1/SHA256 Instruction Extensions */
+#define X86_FEATURE_AVX512BW ( 9*32+30) /* "avx512bw" AVX-512 BW (Byte/Word granular) Instructions */
+#define X86_FEATURE_AVX512VL ( 9*32+31) /* "avx512vl" AVX-512 VL (128/256 Vector Length) Extensions */
/* Extended state features, CPUID level 0x0000000d:1 (EAX), word 10 */
-#define X86_FEATURE_XSAVEOPT (10*32+ 0) /* XSAVEOPT instruction */
-#define X86_FEATURE_XSAVEC (10*32+ 1) /* XSAVEC instruction */
-#define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 instruction */
-#define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS instructions */
-#define X86_FEATURE_XFD (10*32+ 4) /* "" eXtended Feature Disabling */
+#define X86_FEATURE_XSAVEOPT (10*32+ 0) /* "xsaveopt" XSAVEOPT instruction */
+#define X86_FEATURE_XSAVEC (10*32+ 1) /* "xsavec" XSAVEC instruction */
+#define X86_FEATURE_XGETBV1 (10*32+ 2) /* "xgetbv1" XGETBV with ECX = 1 instruction */
+#define X86_FEATURE_XSAVES (10*32+ 3) /* "xsaves" XSAVES/XRSTORS instructions */
+#define X86_FEATURE_XFD (10*32+ 4) /* eXtended Feature Disabling */
/*
* Extended auxiliary flags: Linux defined - for features scattered in various
@@ -283,181 +283,183 @@
*
* Reuse free bits when adding new feature flags!
*/
-#define X86_FEATURE_CQM_LLC (11*32+ 0) /* LLC QoS if 1 */
-#define X86_FEATURE_CQM_OCCUP_LLC (11*32+ 1) /* LLC occupancy monitoring */
-#define X86_FEATURE_CQM_MBM_TOTAL (11*32+ 2) /* LLC Total MBM monitoring */
-#define X86_FEATURE_CQM_MBM_LOCAL (11*32+ 3) /* LLC Local MBM monitoring */
-#define X86_FEATURE_FENCE_SWAPGS_USER (11*32+ 4) /* "" LFENCE in user entry SWAPGS path */
-#define X86_FEATURE_FENCE_SWAPGS_KERNEL (11*32+ 5) /* "" LFENCE in kernel entry SWAPGS path */
-#define X86_FEATURE_SPLIT_LOCK_DETECT (11*32+ 6) /* #AC for split lock */
-#define X86_FEATURE_PER_THREAD_MBA (11*32+ 7) /* "" Per-thread Memory Bandwidth Allocation */
-#define X86_FEATURE_SGX1 (11*32+ 8) /* "" Basic SGX */
-#define X86_FEATURE_SGX2 (11*32+ 9) /* "" SGX Enclave Dynamic Memory Management (EDMM) */
-#define X86_FEATURE_ENTRY_IBPB (11*32+10) /* "" Issue an IBPB on kernel entry */
-#define X86_FEATURE_RRSBA_CTRL (11*32+11) /* "" RET prediction control */
-#define X86_FEATURE_RETPOLINE (11*32+12) /* "" Generic Retpoline mitigation for Spectre variant 2 */
-#define X86_FEATURE_RETPOLINE_LFENCE (11*32+13) /* "" Use LFENCE for Spectre variant 2 */
-#define X86_FEATURE_RETHUNK (11*32+14) /* "" Use REturn THUNK */
-#define X86_FEATURE_UNRET (11*32+15) /* "" AMD BTB untrain return */
-#define X86_FEATURE_USE_IBPB_FW (11*32+16) /* "" Use IBPB during runtime firmware calls */
-#define X86_FEATURE_RSB_VMEXIT_LITE (11*32+17) /* "" Fill RSB on VM exit when EIBRS is enabled */
-#define X86_FEATURE_SGX_EDECCSSA (11*32+18) /* "" SGX EDECCSSA user leaf function */
-#define X86_FEATURE_CALL_DEPTH (11*32+19) /* "" Call depth tracking for RSB stuffing */
-#define X86_FEATURE_MSR_TSX_CTRL (11*32+20) /* "" MSR IA32_TSX_CTRL (Intel) implemented */
-#define X86_FEATURE_SMBA (11*32+21) /* "" Slow Memory Bandwidth Allocation */
-#define X86_FEATURE_BMEC (11*32+22) /* "" Bandwidth Monitoring Event Configuration */
-#define X86_FEATURE_USER_SHSTK (11*32+23) /* Shadow stack support for user mode applications */
-#define X86_FEATURE_SRSO (11*32+24) /* "" AMD BTB untrain RETs */
-#define X86_FEATURE_SRSO_ALIAS (11*32+25) /* "" AMD BTB untrain RETs through aliasing */
-#define X86_FEATURE_IBPB_ON_VMEXIT (11*32+26) /* "" Issue an IBPB only on VMEXIT */
-#define X86_FEATURE_APIC_MSRS_FENCE (11*32+27) /* "" IA32_TSC_DEADLINE and X2APIC MSRs need fencing */
-#define X86_FEATURE_ZEN2 (11*32+28) /* "" CPU based on Zen2 microarchitecture */
-#define X86_FEATURE_ZEN3 (11*32+29) /* "" CPU based on Zen3 microarchitecture */
-#define X86_FEATURE_ZEN4 (11*32+30) /* "" CPU based on Zen4 microarchitecture */
-#define X86_FEATURE_ZEN1 (11*32+31) /* "" CPU based on Zen1 microarchitecture */
+#define X86_FEATURE_CQM_LLC (11*32+ 0) /* "cqm_llc" LLC QoS if 1 */
+#define X86_FEATURE_CQM_OCCUP_LLC (11*32+ 1) /* "cqm_occup_llc" LLC occupancy monitoring */
+#define X86_FEATURE_CQM_MBM_TOTAL (11*32+ 2) /* "cqm_mbm_total" LLC Total MBM monitoring */
+#define X86_FEATURE_CQM_MBM_LOCAL (11*32+ 3) /* "cqm_mbm_local" LLC Local MBM monitoring */
+#define X86_FEATURE_FENCE_SWAPGS_USER (11*32+ 4) /* LFENCE in user entry SWAPGS path */
+#define X86_FEATURE_FENCE_SWAPGS_KERNEL (11*32+ 5) /* LFENCE in kernel entry SWAPGS path */
+#define X86_FEATURE_SPLIT_LOCK_DETECT (11*32+ 6) /* "split_lock_detect" #AC for split lock */
+#define X86_FEATURE_PER_THREAD_MBA (11*32+ 7) /* Per-thread Memory Bandwidth Allocation */
+#define X86_FEATURE_SGX1 (11*32+ 8) /* Basic SGX */
+#define X86_FEATURE_SGX2 (11*32+ 9) /* SGX Enclave Dynamic Memory Management (EDMM) */
+#define X86_FEATURE_ENTRY_IBPB (11*32+10) /* Issue an IBPB on kernel entry */
+#define X86_FEATURE_RRSBA_CTRL (11*32+11) /* RET prediction control */
+#define X86_FEATURE_RETPOLINE (11*32+12) /* Generic Retpoline mitigation for Spectre variant 2 */
+#define X86_FEATURE_RETPOLINE_LFENCE (11*32+13) /* Use LFENCE for Spectre variant 2 */
+#define X86_FEATURE_RETHUNK (11*32+14) /* Use REturn THUNK */
+#define X86_FEATURE_UNRET (11*32+15) /* AMD BTB untrain return */
+#define X86_FEATURE_USE_IBPB_FW (11*32+16) /* Use IBPB during runtime firmware calls */
+#define X86_FEATURE_RSB_VMEXIT_LITE (11*32+17) /* Fill RSB on VM exit when EIBRS is enabled */
+#define X86_FEATURE_SGX_EDECCSSA (11*32+18) /* SGX EDECCSSA user leaf function */
+#define X86_FEATURE_CALL_DEPTH (11*32+19) /* Call depth tracking for RSB stuffing */
+#define X86_FEATURE_MSR_TSX_CTRL (11*32+20) /* MSR IA32_TSX_CTRL (Intel) implemented */
+#define X86_FEATURE_SMBA (11*32+21) /* Slow Memory Bandwidth Allocation */
+#define X86_FEATURE_BMEC (11*32+22) /* Bandwidth Monitoring Event Configuration */
+#define X86_FEATURE_USER_SHSTK (11*32+23) /* "user_shstk" Shadow stack support for user mode applications */
+#define X86_FEATURE_SRSO (11*32+24) /* AMD BTB untrain RETs */
+#define X86_FEATURE_SRSO_ALIAS (11*32+25) /* AMD BTB untrain RETs through aliasing */
+#define X86_FEATURE_IBPB_ON_VMEXIT (11*32+26) /* Issue an IBPB only on VMEXIT */
+#define X86_FEATURE_APIC_MSRS_FENCE (11*32+27) /* IA32_TSC_DEADLINE and X2APIC MSRs need fencing */
+#define X86_FEATURE_ZEN2 (11*32+28) /* CPU based on Zen2 microarchitecture */
+#define X86_FEATURE_ZEN3 (11*32+29) /* CPU based on Zen3 microarchitecture */
+#define X86_FEATURE_ZEN4 (11*32+30) /* CPU based on Zen4 microarchitecture */
+#define X86_FEATURE_ZEN1 (11*32+31) /* CPU based on Zen1 microarchitecture */
/* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */
-#define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */
-#define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* AVX512 BFLOAT16 instructions */
-#define X86_FEATURE_CMPCCXADD (12*32+ 7) /* "" CMPccXADD instructions */
-#define X86_FEATURE_ARCH_PERFMON_EXT (12*32+ 8) /* "" Intel Architectural PerfMon Extension */
-#define X86_FEATURE_FZRM (12*32+10) /* "" Fast zero-length REP MOVSB */
-#define X86_FEATURE_FSRS (12*32+11) /* "" Fast short REP STOSB */
-#define X86_FEATURE_FSRC (12*32+12) /* "" Fast short REP {CMPSB,SCASB} */
-#define X86_FEATURE_FRED (12*32+17) /* Flexible Return and Event Delivery */
-#define X86_FEATURE_LKGS (12*32+18) /* "" Load "kernel" (userspace) GS */
-#define X86_FEATURE_WRMSRNS (12*32+19) /* "" Non-serializing WRMSR */
-#define X86_FEATURE_AMX_FP16 (12*32+21) /* "" AMX fp16 Support */
-#define X86_FEATURE_AVX_IFMA (12*32+23) /* "" Support for VPMADD52[H,L]UQ */
-#define X86_FEATURE_LAM (12*32+26) /* Linear Address Masking */
+#define X86_FEATURE_AVX_VNNI (12*32+ 4) /* "avx_vnni" AVX VNNI instructions */
+#define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* "avx512_bf16" AVX512 BFLOAT16 instructions */
+#define X86_FEATURE_CMPCCXADD (12*32+ 7) /* CMPccXADD instructions */
+#define X86_FEATURE_ARCH_PERFMON_EXT (12*32+ 8) /* Intel Architectural PerfMon Extension */
+#define X86_FEATURE_FZRM (12*32+10) /* Fast zero-length REP MOVSB */
+#define X86_FEATURE_FSRS (12*32+11) /* Fast short REP STOSB */
+#define X86_FEATURE_FSRC (12*32+12) /* Fast short REP {CMPSB,SCASB} */
+#define X86_FEATURE_FRED (12*32+17) /* "fred" Flexible Return and Event Delivery */
+#define X86_FEATURE_LKGS (12*32+18) /* Load "kernel" (userspace) GS */
+#define X86_FEATURE_WRMSRNS (12*32+19) /* Non-serializing WRMSR */
+#define X86_FEATURE_AMX_FP16 (12*32+21) /* AMX fp16 Support */
+#define X86_FEATURE_AVX_IFMA (12*32+23) /* Support for VPMADD52[H,L]UQ */
+#define X86_FEATURE_LAM (12*32+26) /* "lam" Linear Address Masking */
/* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */
-#define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */
-#define X86_FEATURE_IRPERF (13*32+ 1) /* Instructions Retired Count */
-#define X86_FEATURE_XSAVEERPTR (13*32+ 2) /* Always save/restore FP error pointers */
-#define X86_FEATURE_RDPRU (13*32+ 4) /* Read processor register at user level */
-#define X86_FEATURE_WBNOINVD (13*32+ 9) /* WBNOINVD instruction */
-#define X86_FEATURE_AMD_IBPB (13*32+12) /* "" Indirect Branch Prediction Barrier */
-#define X86_FEATURE_AMD_IBRS (13*32+14) /* "" Indirect Branch Restricted Speculation */
-#define X86_FEATURE_AMD_STIBP (13*32+15) /* "" Single Thread Indirect Branch Predictors */
-#define X86_FEATURE_AMD_STIBP_ALWAYS_ON (13*32+17) /* "" Single Thread Indirect Branch Predictors always-on preferred */
-#define X86_FEATURE_AMD_PPIN (13*32+23) /* Protected Processor Inventory Number */
-#define X86_FEATURE_AMD_SSBD (13*32+24) /* "" Speculative Store Bypass Disable */
-#define X86_FEATURE_VIRT_SSBD (13*32+25) /* Virtualized Speculative Store Bypass Disable */
-#define X86_FEATURE_AMD_SSB_NO (13*32+26) /* "" Speculative Store Bypass is fixed in hardware. */
-#define X86_FEATURE_CPPC (13*32+27) /* Collaborative Processor Performance Control */
-#define X86_FEATURE_AMD_PSFD (13*32+28) /* "" Predictive Store Forwarding Disable */
-#define X86_FEATURE_BTC_NO (13*32+29) /* "" Not vulnerable to Branch Type Confusion */
-#define X86_FEATURE_BRS (13*32+31) /* Branch Sampling available */
+#define X86_FEATURE_CLZERO (13*32+ 0) /* "clzero" CLZERO instruction */
+#define X86_FEATURE_IRPERF (13*32+ 1) /* "irperf" Instructions Retired Count */
+#define X86_FEATURE_XSAVEERPTR (13*32+ 2) /* "xsaveerptr" Always save/restore FP error pointers */
+#define X86_FEATURE_RDPRU (13*32+ 4) /* "rdpru" Read processor register at user level */
+#define X86_FEATURE_WBNOINVD (13*32+ 9) /* "wbnoinvd" WBNOINVD instruction */
+#define X86_FEATURE_AMD_IBPB (13*32+12) /* Indirect Branch Prediction Barrier */
+#define X86_FEATURE_AMD_IBRS (13*32+14) /* Indirect Branch Restricted Speculation */
+#define X86_FEATURE_AMD_STIBP (13*32+15) /* Single Thread Indirect Branch Predictors */
+#define X86_FEATURE_AMD_STIBP_ALWAYS_ON (13*32+17) /* Single Thread Indirect Branch Predictors always-on preferred */
+#define X86_FEATURE_AMD_PPIN (13*32+23) /* "amd_ppin" Protected Processor Inventory Number */
+#define X86_FEATURE_AMD_SSBD (13*32+24) /* Speculative Store Bypass Disable */
+#define X86_FEATURE_VIRT_SSBD (13*32+25) /* "virt_ssbd" Virtualized Speculative Store Bypass Disable */
+#define X86_FEATURE_AMD_SSB_NO (13*32+26) /* Speculative Store Bypass is fixed in hardware. */
+#define X86_FEATURE_CPPC (13*32+27) /* "cppc" Collaborative Processor Performance Control */
+#define X86_FEATURE_AMD_PSFD (13*32+28) /* Predictive Store Forwarding Disable */
+#define X86_FEATURE_BTC_NO (13*32+29) /* Not vulnerable to Branch Type Confusion */
+#define X86_FEATURE_BRS (13*32+31) /* "brs" Branch Sampling available */
/* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */
-#define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */
-#define X86_FEATURE_IDA (14*32+ 1) /* Intel Dynamic Acceleration */
-#define X86_FEATURE_ARAT (14*32+ 2) /* Always Running APIC Timer */
-#define X86_FEATURE_PLN (14*32+ 4) /* Intel Power Limit Notification */
-#define X86_FEATURE_PTS (14*32+ 6) /* Intel Package Thermal Status */
-#define X86_FEATURE_HWP (14*32+ 7) /* Intel Hardware P-states */
-#define X86_FEATURE_HWP_NOTIFY (14*32+ 8) /* HWP Notification */
-#define X86_FEATURE_HWP_ACT_WINDOW (14*32+ 9) /* HWP Activity Window */
-#define X86_FEATURE_HWP_EPP (14*32+10) /* HWP Energy Perf. Preference */
-#define X86_FEATURE_HWP_PKG_REQ (14*32+11) /* HWP Package Level Request */
-#define X86_FEATURE_HFI (14*32+19) /* Hardware Feedback Interface */
+#define X86_FEATURE_DTHERM (14*32+ 0) /* "dtherm" Digital Thermal Sensor */
+#define X86_FEATURE_IDA (14*32+ 1) /* "ida" Intel Dynamic Acceleration */
+#define X86_FEATURE_ARAT (14*32+ 2) /* "arat" Always Running APIC Timer */
+#define X86_FEATURE_PLN (14*32+ 4) /* "pln" Intel Power Limit Notification */
+#define X86_FEATURE_PTS (14*32+ 6) /* "pts" Intel Package Thermal Status */
+#define X86_FEATURE_HWP (14*32+ 7) /* "hwp" Intel Hardware P-states */
+#define X86_FEATURE_HWP_NOTIFY (14*32+ 8) /* "hwp_notify" HWP Notification */
+#define X86_FEATURE_HWP_ACT_WINDOW (14*32+ 9) /* "hwp_act_window" HWP Activity Window */
+#define X86_FEATURE_HWP_EPP (14*32+10) /* "hwp_epp" HWP Energy Perf. Preference */
+#define X86_FEATURE_HWP_PKG_REQ (14*32+11) /* "hwp_pkg_req" HWP Package Level Request */
+#define X86_FEATURE_HWP_HIGHEST_PERF_CHANGE (14*32+15) /* HWP Highest perf change */
+#define X86_FEATURE_HFI (14*32+19) /* "hfi" Hardware Feedback Interface */
/* AMD SVM Feature Identification, CPUID level 0x8000000a (EDX), word 15 */
-#define X86_FEATURE_NPT (15*32+ 0) /* Nested Page Table support */
-#define X86_FEATURE_LBRV (15*32+ 1) /* LBR Virtualization support */
+#define X86_FEATURE_NPT (15*32+ 0) /* "npt" Nested Page Table support */
+#define X86_FEATURE_LBRV (15*32+ 1) /* "lbrv" LBR Virtualization support */
#define X86_FEATURE_SVML (15*32+ 2) /* "svm_lock" SVM locking MSR */
#define X86_FEATURE_NRIPS (15*32+ 3) /* "nrip_save" SVM next_rip save */
#define X86_FEATURE_TSCRATEMSR (15*32+ 4) /* "tsc_scale" TSC scaling support */
#define X86_FEATURE_VMCBCLEAN (15*32+ 5) /* "vmcb_clean" VMCB clean bits support */
-#define X86_FEATURE_FLUSHBYASID (15*32+ 6) /* flush-by-ASID support */
-#define X86_FEATURE_DECODEASSISTS (15*32+ 7) /* Decode Assists support */
-#define X86_FEATURE_PAUSEFILTER (15*32+10) /* filtered pause intercept */
-#define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */
-#define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */
-#define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* Virtual VMSAVE VMLOAD */
-#define X86_FEATURE_VGIF (15*32+16) /* Virtual GIF */
-#define X86_FEATURE_X2AVIC (15*32+18) /* Virtual x2apic */
-#define X86_FEATURE_V_SPEC_CTRL (15*32+20) /* Virtual SPEC_CTRL */
-#define X86_FEATURE_VNMI (15*32+25) /* Virtual NMI */
-#define X86_FEATURE_SVME_ADDR_CHK (15*32+28) /* "" SVME addr check */
+#define X86_FEATURE_FLUSHBYASID (15*32+ 6) /* "flushbyasid" Flush-by-ASID support */
+#define X86_FEATURE_DECODEASSISTS (15*32+ 7) /* "decodeassists" Decode Assists support */
+#define X86_FEATURE_PAUSEFILTER (15*32+10) /* "pausefilter" Filtered pause intercept */
+#define X86_FEATURE_PFTHRESHOLD (15*32+12) /* "pfthreshold" Pause filter threshold */
+#define X86_FEATURE_AVIC (15*32+13) /* "avic" Virtual Interrupt Controller */
+#define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* "v_vmsave_vmload" Virtual VMSAVE VMLOAD */
+#define X86_FEATURE_VGIF (15*32+16) /* "vgif" Virtual GIF */
+#define X86_FEATURE_X2AVIC (15*32+18) /* "x2avic" Virtual x2apic */
+#define X86_FEATURE_V_SPEC_CTRL (15*32+20) /* "v_spec_ctrl" Virtual SPEC_CTRL */
+#define X86_FEATURE_VNMI (15*32+25) /* "vnmi" Virtual NMI */
+#define X86_FEATURE_SVME_ADDR_CHK (15*32+28) /* SVME addr check */
/* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */
-#define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/
-#define X86_FEATURE_UMIP (16*32+ 2) /* User Mode Instruction Protection */
-#define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */
-#define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */
-#define X86_FEATURE_WAITPKG (16*32+ 5) /* UMONITOR/UMWAIT/TPAUSE Instructions */
-#define X86_FEATURE_AVX512_VBMI2 (16*32+ 6) /* Additional AVX512 Vector Bit Manipulation Instructions */
-#define X86_FEATURE_SHSTK (16*32+ 7) /* "" Shadow stack */
-#define X86_FEATURE_GFNI (16*32+ 8) /* Galois Field New Instructions */
-#define X86_FEATURE_VAES (16*32+ 9) /* Vector AES */
-#define X86_FEATURE_VPCLMULQDQ (16*32+10) /* Carry-Less Multiplication Double Quadword */
-#define X86_FEATURE_AVX512_VNNI (16*32+11) /* Vector Neural Network Instructions */
-#define X86_FEATURE_AVX512_BITALG (16*32+12) /* Support for VPOPCNT[B,W] and VPSHUF-BITQMB instructions */
-#define X86_FEATURE_TME (16*32+13) /* Intel Total Memory Encryption */
-#define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */
-#define X86_FEATURE_LA57 (16*32+16) /* 5-level page tables */
-#define X86_FEATURE_RDPID (16*32+22) /* RDPID instruction */
-#define X86_FEATURE_BUS_LOCK_DETECT (16*32+24) /* Bus Lock detect */
-#define X86_FEATURE_CLDEMOTE (16*32+25) /* CLDEMOTE instruction */
-#define X86_FEATURE_MOVDIRI (16*32+27) /* MOVDIRI instruction */
-#define X86_FEATURE_MOVDIR64B (16*32+28) /* MOVDIR64B instruction */
-#define X86_FEATURE_ENQCMD (16*32+29) /* ENQCMD and ENQCMDS instructions */
-#define X86_FEATURE_SGX_LC (16*32+30) /* Software Guard Extensions Launch Control */
+#define X86_FEATURE_AVX512VBMI (16*32+ 1) /* "avx512vbmi" AVX512 Vector Bit Manipulation instructions*/
+#define X86_FEATURE_UMIP (16*32+ 2) /* "umip" User Mode Instruction Protection */
+#define X86_FEATURE_PKU (16*32+ 3) /* "pku" Protection Keys for Userspace */
+#define X86_FEATURE_OSPKE (16*32+ 4) /* "ospke" OS Protection Keys Enable */
+#define X86_FEATURE_WAITPKG (16*32+ 5) /* "waitpkg" UMONITOR/UMWAIT/TPAUSE Instructions */
+#define X86_FEATURE_AVX512_VBMI2 (16*32+ 6) /* "avx512_vbmi2" Additional AVX512 Vector Bit Manipulation Instructions */
+#define X86_FEATURE_SHSTK (16*32+ 7) /* Shadow stack */
+#define X86_FEATURE_GFNI (16*32+ 8) /* "gfni" Galois Field New Instructions */
+#define X86_FEATURE_VAES (16*32+ 9) /* "vaes" Vector AES */
+#define X86_FEATURE_VPCLMULQDQ (16*32+10) /* "vpclmulqdq" Carry-Less Multiplication Double Quadword */
+#define X86_FEATURE_AVX512_VNNI (16*32+11) /* "avx512_vnni" Vector Neural Network Instructions */
+#define X86_FEATURE_AVX512_BITALG (16*32+12) /* "avx512_bitalg" Support for VPOPCNT[B,W] and VPSHUF-BITQMB instructions */
+#define X86_FEATURE_TME (16*32+13) /* "tme" Intel Total Memory Encryption */
+#define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* "avx512_vpopcntdq" POPCNT for vectors of DW/QW */
+#define X86_FEATURE_LA57 (16*32+16) /* "la57" 5-level page tables */
+#define X86_FEATURE_RDPID (16*32+22) /* "rdpid" RDPID instruction */
+#define X86_FEATURE_BUS_LOCK_DETECT (16*32+24) /* "bus_lock_detect" Bus Lock detect */
+#define X86_FEATURE_CLDEMOTE (16*32+25) /* "cldemote" CLDEMOTE instruction */
+#define X86_FEATURE_MOVDIRI (16*32+27) /* "movdiri" MOVDIRI instruction */
+#define X86_FEATURE_MOVDIR64B (16*32+28) /* "movdir64b" MOVDIR64B instruction */
+#define X86_FEATURE_ENQCMD (16*32+29) /* "enqcmd" ENQCMD and ENQCMDS instructions */
+#define X86_FEATURE_SGX_LC (16*32+30) /* "sgx_lc" Software Guard Extensions Launch Control */
/* AMD-defined CPU features, CPUID level 0x80000007 (EBX), word 17 */
-#define X86_FEATURE_OVERFLOW_RECOV (17*32+ 0) /* MCA overflow recovery support */
-#define X86_FEATURE_SUCCOR (17*32+ 1) /* Uncorrectable error containment and recovery */
-#define X86_FEATURE_SMCA (17*32+ 3) /* Scalable MCA */
+#define X86_FEATURE_OVERFLOW_RECOV (17*32+ 0) /* "overflow_recov" MCA overflow recovery support */
+#define X86_FEATURE_SUCCOR (17*32+ 1) /* "succor" Uncorrectable error containment and recovery */
+#define X86_FEATURE_SMCA (17*32+ 3) /* "smca" Scalable MCA */
/* Intel-defined CPU features, CPUID level 0x00000007:0 (EDX), word 18 */
-#define X86_FEATURE_AVX512_4VNNIW (18*32+ 2) /* AVX-512 Neural Network Instructions */
-#define X86_FEATURE_AVX512_4FMAPS (18*32+ 3) /* AVX-512 Multiply Accumulation Single precision */
-#define X86_FEATURE_FSRM (18*32+ 4) /* Fast Short Rep Mov */
-#define X86_FEATURE_AVX512_VP2INTERSECT (18*32+ 8) /* AVX-512 Intersect for D/Q */
-#define X86_FEATURE_SRBDS_CTRL (18*32+ 9) /* "" SRBDS mitigation MSR available */
-#define X86_FEATURE_MD_CLEAR (18*32+10) /* VERW clears CPU buffers */
-#define X86_FEATURE_RTM_ALWAYS_ABORT (18*32+11) /* "" RTM transaction always aborts */
-#define X86_FEATURE_TSX_FORCE_ABORT (18*32+13) /* "" TSX_FORCE_ABORT */
-#define X86_FEATURE_SERIALIZE (18*32+14) /* SERIALIZE instruction */
-#define X86_FEATURE_HYBRID_CPU (18*32+15) /* "" This part has CPUs of more than one type */
-#define X86_FEATURE_TSXLDTRK (18*32+16) /* TSX Suspend Load Address Tracking */
-#define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */
-#define X86_FEATURE_ARCH_LBR (18*32+19) /* Intel ARCH LBR */
-#define X86_FEATURE_IBT (18*32+20) /* Indirect Branch Tracking */
-#define X86_FEATURE_AMX_BF16 (18*32+22) /* AMX bf16 Support */
-#define X86_FEATURE_AVX512_FP16 (18*32+23) /* AVX512 FP16 */
-#define X86_FEATURE_AMX_TILE (18*32+24) /* AMX tile Support */
-#define X86_FEATURE_AMX_INT8 (18*32+25) /* AMX int8 Support */
-#define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */
-#define X86_FEATURE_INTEL_STIBP (18*32+27) /* "" Single Thread Indirect Branch Predictors */
-#define X86_FEATURE_FLUSH_L1D (18*32+28) /* Flush L1D cache */
-#define X86_FEATURE_ARCH_CAPABILITIES (18*32+29) /* IA32_ARCH_CAPABILITIES MSR (Intel) */
-#define X86_FEATURE_CORE_CAPABILITIES (18*32+30) /* "" IA32_CORE_CAPABILITIES MSR */
-#define X86_FEATURE_SPEC_CTRL_SSBD (18*32+31) /* "" Speculative Store Bypass Disable */
+#define X86_FEATURE_AVX512_4VNNIW (18*32+ 2) /* "avx512_4vnniw" AVX-512 Neural Network Instructions */
+#define X86_FEATURE_AVX512_4FMAPS (18*32+ 3) /* "avx512_4fmaps" AVX-512 Multiply Accumulation Single precision */
+#define X86_FEATURE_FSRM (18*32+ 4) /* "fsrm" Fast Short Rep Mov */
+#define X86_FEATURE_AVX512_VP2INTERSECT (18*32+ 8) /* "avx512_vp2intersect" AVX-512 Intersect for D/Q */
+#define X86_FEATURE_SRBDS_CTRL (18*32+ 9) /* SRBDS mitigation MSR available */
+#define X86_FEATURE_MD_CLEAR (18*32+10) /* "md_clear" VERW clears CPU buffers */
+#define X86_FEATURE_RTM_ALWAYS_ABORT (18*32+11) /* RTM transaction always aborts */
+#define X86_FEATURE_TSX_FORCE_ABORT (18*32+13) /* TSX_FORCE_ABORT */
+#define X86_FEATURE_SERIALIZE (18*32+14) /* "serialize" SERIALIZE instruction */
+#define X86_FEATURE_HYBRID_CPU (18*32+15) /* This part has CPUs of more than one type */
+#define X86_FEATURE_TSXLDTRK (18*32+16) /* "tsxldtrk" TSX Suspend Load Address Tracking */
+#define X86_FEATURE_PCONFIG (18*32+18) /* "pconfig" Intel PCONFIG */
+#define X86_FEATURE_ARCH_LBR (18*32+19) /* "arch_lbr" Intel ARCH LBR */
+#define X86_FEATURE_IBT (18*32+20) /* "ibt" Indirect Branch Tracking */
+#define X86_FEATURE_AMX_BF16 (18*32+22) /* "amx_bf16" AMX bf16 Support */
+#define X86_FEATURE_AVX512_FP16 (18*32+23) /* "avx512_fp16" AVX512 FP16 */
+#define X86_FEATURE_AMX_TILE (18*32+24) /* "amx_tile" AMX tile Support */
+#define X86_FEATURE_AMX_INT8 (18*32+25) /* "amx_int8" AMX int8 Support */
+#define X86_FEATURE_SPEC_CTRL (18*32+26) /* Speculation Control (IBRS + IBPB) */
+#define X86_FEATURE_INTEL_STIBP (18*32+27) /* Single Thread Indirect Branch Predictors */
+#define X86_FEATURE_FLUSH_L1D (18*32+28) /* "flush_l1d" Flush L1D cache */
+#define X86_FEATURE_ARCH_CAPABILITIES (18*32+29) /* "arch_capabilities" IA32_ARCH_CAPABILITIES MSR (Intel) */
+#define X86_FEATURE_CORE_CAPABILITIES (18*32+30) /* IA32_CORE_CAPABILITIES MSR */
+#define X86_FEATURE_SPEC_CTRL_SSBD (18*32+31) /* Speculative Store Bypass Disable */
/* AMD-defined memory encryption features, CPUID level 0x8000001f (EAX), word 19 */
-#define X86_FEATURE_SME (19*32+ 0) /* AMD Secure Memory Encryption */
-#define X86_FEATURE_SEV (19*32+ 1) /* AMD Secure Encrypted Virtualization */
-#define X86_FEATURE_VM_PAGE_FLUSH (19*32+ 2) /* "" VM Page Flush MSR is supported */
-#define X86_FEATURE_SEV_ES (19*32+ 3) /* AMD Secure Encrypted Virtualization - Encrypted State */
-#define X86_FEATURE_SEV_SNP (19*32+ 4) /* AMD Secure Encrypted Virtualization - Secure Nested Paging */
-#define X86_FEATURE_V_TSC_AUX (19*32+ 9) /* "" Virtual TSC_AUX */
-#define X86_FEATURE_SME_COHERENT (19*32+10) /* "" AMD hardware-enforced cache coherency */
-#define X86_FEATURE_DEBUG_SWAP (19*32+14) /* AMD SEV-ES full debug state swap support */
+#define X86_FEATURE_SME (19*32+ 0) /* "sme" AMD Secure Memory Encryption */
+#define X86_FEATURE_SEV (19*32+ 1) /* "sev" AMD Secure Encrypted Virtualization */
+#define X86_FEATURE_VM_PAGE_FLUSH (19*32+ 2) /* VM Page Flush MSR is supported */
+#define X86_FEATURE_SEV_ES (19*32+ 3) /* "sev_es" AMD Secure Encrypted Virtualization - Encrypted State */
+#define X86_FEATURE_SEV_SNP (19*32+ 4) /* "sev_snp" AMD Secure Encrypted Virtualization - Secure Nested Paging */
+#define X86_FEATURE_V_TSC_AUX (19*32+ 9) /* Virtual TSC_AUX */
+#define X86_FEATURE_SME_COHERENT (19*32+10) /* AMD hardware-enforced cache coherency */
+#define X86_FEATURE_DEBUG_SWAP (19*32+14) /* "debug_swap" AMD SEV-ES full debug state swap support */
+#define X86_FEATURE_SVSM (19*32+28) /* "svsm" SVSM present */
/* AMD-defined Extended Feature 2 EAX, CPUID level 0x80000021 (EAX), word 20 */
-#define X86_FEATURE_NO_NESTED_DATA_BP (20*32+ 0) /* "" No Nested Data Breakpoints */
-#define X86_FEATURE_WRMSR_XX_BASE_NS (20*32+ 1) /* "" WRMSR to {FS,GS,KERNEL_GS}_BASE is non-serializing */
-#define X86_FEATURE_LFENCE_RDTSC (20*32+ 2) /* "" LFENCE always serializing / synchronizes RDTSC */
-#define X86_FEATURE_NULL_SEL_CLR_BASE (20*32+ 6) /* "" Null Selector Clears Base */
-#define X86_FEATURE_AUTOIBRS (20*32+ 8) /* "" Automatic IBRS */
-#define X86_FEATURE_NO_SMM_CTL_MSR (20*32+ 9) /* "" SMM_CTL MSR is not present */
+#define X86_FEATURE_NO_NESTED_DATA_BP (20*32+ 0) /* No Nested Data Breakpoints */
+#define X86_FEATURE_WRMSR_XX_BASE_NS (20*32+ 1) /* WRMSR to {FS,GS,KERNEL_GS}_BASE is non-serializing */
+#define X86_FEATURE_LFENCE_RDTSC (20*32+ 2) /* LFENCE always serializing / synchronizes RDTSC */
+#define X86_FEATURE_NULL_SEL_CLR_BASE (20*32+ 6) /* Null Selector Clears Base */
+#define X86_FEATURE_AUTOIBRS (20*32+ 8) /* Automatic IBRS */
+#define X86_FEATURE_NO_SMM_CTL_MSR (20*32+ 9) /* SMM_CTL MSR is not present */
-#define X86_FEATURE_SBPB (20*32+27) /* "" Selective Branch Prediction Barrier */
-#define X86_FEATURE_IBPB_BRTYPE (20*32+28) /* "" MSR_PRED_CMD[IBPB] flushes all branch type predictions */
-#define X86_FEATURE_SRSO_NO (20*32+29) /* "" CPU is not affected by SRSO */
+#define X86_FEATURE_SBPB (20*32+27) /* Selective Branch Prediction Barrier */
+#define X86_FEATURE_IBPB_BRTYPE (20*32+28) /* MSR_PRED_CMD[IBPB] flushes all branch type predictions */
+#define X86_FEATURE_SRSO_NO (20*32+29) /* CPU is not affected by SRSO */
/*
* Extended auxiliary flags: Linux defined - for features scattered in various
@@ -465,59 +467,60 @@
*
* Reuse free bits when adding new feature flags!
*/
-#define X86_FEATURE_AMD_LBR_PMC_FREEZE (21*32+ 0) /* AMD LBR and PMC Freeze */
-#define X86_FEATURE_CLEAR_BHB_LOOP (21*32+ 1) /* "" Clear branch history at syscall entry using SW loop */
-#define X86_FEATURE_BHI_CTRL (21*32+ 2) /* "" BHI_DIS_S HW control available */
-#define X86_FEATURE_CLEAR_BHB_HW (21*32+ 3) /* "" BHI_DIS_S HW control enabled */
-#define X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT (21*32+ 4) /* "" Clear branch history at vmexit using SW loop */
+#define X86_FEATURE_AMD_LBR_PMC_FREEZE (21*32+ 0) /* "amd_lbr_pmc_freeze" AMD LBR and PMC Freeze */
+#define X86_FEATURE_CLEAR_BHB_LOOP (21*32+ 1) /* Clear branch history at syscall entry using SW loop */
+#define X86_FEATURE_BHI_CTRL (21*32+ 2) /* BHI_DIS_S HW control available */
+#define X86_FEATURE_CLEAR_BHB_HW (21*32+ 3) /* BHI_DIS_S HW control enabled */
+#define X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT (21*32+ 4) /* Clear branch history at vmexit using SW loop */
+#define X86_FEATURE_FAST_CPPC (21*32 + 5) /* AMD Fast CPPC */
/*
* BUG word(s)
*/
#define X86_BUG(x) (NCAPINTS*32 + (x))
-#define X86_BUG_F00F X86_BUG(0) /* Intel F00F */
-#define X86_BUG_FDIV X86_BUG(1) /* FPU FDIV */
-#define X86_BUG_COMA X86_BUG(2) /* Cyrix 6x86 coma */
+#define X86_BUG_F00F X86_BUG(0) /* "f00f" Intel F00F */
+#define X86_BUG_FDIV X86_BUG(1) /* "fdiv" FPU FDIV */
+#define X86_BUG_COMA X86_BUG(2) /* "coma" Cyrix 6x86 coma */
#define X86_BUG_AMD_TLB_MMATCH X86_BUG(3) /* "tlb_mmatch" AMD Erratum 383 */
#define X86_BUG_AMD_APIC_C1E X86_BUG(4) /* "apic_c1e" AMD Erratum 400 */
-#define X86_BUG_11AP X86_BUG(5) /* Bad local APIC aka 11AP */
-#define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */
-#define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */
-#define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up SS attrs */
+#define X86_BUG_11AP X86_BUG(5) /* "11ap" Bad local APIC aka 11AP */
+#define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* "fxsave_leak" FXSAVE leaks FOP/FIP/FOP */
+#define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* "clflush_monitor" AAI65, CLFLUSH required before MONITOR */
+#define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* "sysret_ss_attrs" SYSRET doesn't fix up SS attrs */
#ifdef CONFIG_X86_32
/*
* 64-bit kernels don't use X86_BUG_ESPFIX. Make the define conditional
* to avoid confusion.
*/
-#define X86_BUG_ESPFIX X86_BUG(9) /* "" IRET to 16-bit SS corrupts ESP/RSP high bits */
+#define X86_BUG_ESPFIX X86_BUG(9) /* IRET to 16-bit SS corrupts ESP/RSP high bits */
#endif
-#define X86_BUG_NULL_SEG X86_BUG(10) /* Nulling a selector preserves the base */
-#define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* SWAPGS without input dep on GS */
-#define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */
-#define X86_BUG_AMD_E400 X86_BUG(13) /* CPU is among the affected by Erratum 400 */
-#define X86_BUG_CPU_MELTDOWN X86_BUG(14) /* CPU is affected by meltdown attack and needs kernel page table isolation */
-#define X86_BUG_SPECTRE_V1 X86_BUG(15) /* CPU is affected by Spectre variant 1 attack with conditional branches */
-#define X86_BUG_SPECTRE_V2 X86_BUG(16) /* CPU is affected by Spectre variant 2 attack with indirect branches */
-#define X86_BUG_SPEC_STORE_BYPASS X86_BUG(17) /* CPU is affected by speculative store bypass attack */
-#define X86_BUG_L1TF X86_BUG(18) /* CPU is affected by L1 Terminal Fault */
-#define X86_BUG_MDS X86_BUG(19) /* CPU is affected by Microarchitectural data sampling */
-#define X86_BUG_MSBDS_ONLY X86_BUG(20) /* CPU is only affected by the MSDBS variant of BUG_MDS */
-#define X86_BUG_SWAPGS X86_BUG(21) /* CPU is affected by speculation through SWAPGS */
-#define X86_BUG_TAA X86_BUG(22) /* CPU is affected by TSX Async Abort(TAA) */
-#define X86_BUG_ITLB_MULTIHIT X86_BUG(23) /* CPU may incur MCE during certain page attribute changes */
-#define X86_BUG_SRBDS X86_BUG(24) /* CPU may leak RNG bits if not mitigated */
-#define X86_BUG_MMIO_STALE_DATA X86_BUG(25) /* CPU is affected by Processor MMIO Stale Data vulnerabilities */
-#define X86_BUG_MMIO_UNKNOWN X86_BUG(26) /* CPU is too old and its MMIO Stale Data status is unknown */
-#define X86_BUG_RETBLEED X86_BUG(27) /* CPU is affected by RETBleed */
-#define X86_BUG_EIBRS_PBRSB X86_BUG(28) /* EIBRS is vulnerable to Post Barrier RSB Predictions */
-#define X86_BUG_SMT_RSB X86_BUG(29) /* CPU is vulnerable to Cross-Thread Return Address Predictions */
-#define X86_BUG_GDS X86_BUG(30) /* CPU is affected by Gather Data Sampling */
-#define X86_BUG_TDX_PW_MCE X86_BUG(31) /* CPU may incur #MC if non-TD software does partial write to TDX private memory */
+#define X86_BUG_NULL_SEG X86_BUG(10) /* "null_seg" Nulling a selector preserves the base */
+#define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* "swapgs_fence" SWAPGS without input dep on GS */
+#define X86_BUG_MONITOR X86_BUG(12) /* "monitor" IPI required to wake up remote CPU */
+#define X86_BUG_AMD_E400 X86_BUG(13) /* "amd_e400" CPU is among the affected by Erratum 400 */
+#define X86_BUG_CPU_MELTDOWN X86_BUG(14) /* "cpu_meltdown" CPU is affected by meltdown attack and needs kernel page table isolation */
+#define X86_BUG_SPECTRE_V1 X86_BUG(15) /* "spectre_v1" CPU is affected by Spectre variant 1 attack with conditional branches */
+#define X86_BUG_SPECTRE_V2 X86_BUG(16) /* "spectre_v2" CPU is affected by Spectre variant 2 attack with indirect branches */
+#define X86_BUG_SPEC_STORE_BYPASS X86_BUG(17) /* "spec_store_bypass" CPU is affected by speculative store bypass attack */
+#define X86_BUG_L1TF X86_BUG(18) /* "l1tf" CPU is affected by L1 Terminal Fault */
+#define X86_BUG_MDS X86_BUG(19) /* "mds" CPU is affected by Microarchitectural data sampling */
+#define X86_BUG_MSBDS_ONLY X86_BUG(20) /* "msbds_only" CPU is only affected by the MSDBS variant of BUG_MDS */
+#define X86_BUG_SWAPGS X86_BUG(21) /* "swapgs" CPU is affected by speculation through SWAPGS */
+#define X86_BUG_TAA X86_BUG(22) /* "taa" CPU is affected by TSX Async Abort(TAA) */
+#define X86_BUG_ITLB_MULTIHIT X86_BUG(23) /* "itlb_multihit" CPU may incur MCE during certain page attribute changes */
+#define X86_BUG_SRBDS X86_BUG(24) /* "srbds" CPU may leak RNG bits if not mitigated */
+#define X86_BUG_MMIO_STALE_DATA X86_BUG(25) /* "mmio_stale_data" CPU is affected by Processor MMIO Stale Data vulnerabilities */
+#define X86_BUG_MMIO_UNKNOWN X86_BUG(26) /* "mmio_unknown" CPU is too old and its MMIO Stale Data status is unknown */
+#define X86_BUG_RETBLEED X86_BUG(27) /* "retbleed" CPU is affected by RETBleed */
+#define X86_BUG_EIBRS_PBRSB X86_BUG(28) /* "eibrs_pbrsb" EIBRS is vulnerable to Post Barrier RSB Predictions */
+#define X86_BUG_SMT_RSB X86_BUG(29) /* "smt_rsb" CPU is vulnerable to Cross-Thread Return Address Predictions */
+#define X86_BUG_GDS X86_BUG(30) /* "gds" CPU is affected by Gather Data Sampling */
+#define X86_BUG_TDX_PW_MCE X86_BUG(31) /* "tdx_pw_mce" CPU may incur #MC if non-TD software does partial write to TDX private memory */
/* BUG word 2 */
-#define X86_BUG_SRSO X86_BUG(1*32 + 0) /* AMD SRSO bug */
-#define X86_BUG_DIV0 X86_BUG(1*32 + 1) /* AMD DIV0 speculation bug */
-#define X86_BUG_RFDS X86_BUG(1*32 + 2) /* CPU is vulnerable to Register File Data Sampling */
-#define X86_BUG_BHI X86_BUG(1*32 + 3) /* CPU is affected by Branch History Injection */
+#define X86_BUG_SRSO X86_BUG(1*32 + 0) /* "srso" AMD SRSO bug */
+#define X86_BUG_DIV0 X86_BUG(1*32 + 1) /* "div0" AMD DIV0 speculation bug */
+#define X86_BUG_RFDS X86_BUG(1*32 + 2) /* "rfds" CPU is vulnerable to Register File Data Sampling */
+#define X86_BUG_BHI X86_BUG(1*32 + 3) /* "bhi" CPU is affected by Branch History Injection */
#endif /* _ASM_X86_CPUFEATURES_H */
diff --git a/arch/x86/include/asm/cpuid.h b/arch/x86/include/asm/cpuid.h
index 6b122a31da06..ca4243318aad 100644
--- a/arch/x86/include/asm/cpuid.h
+++ b/arch/x86/include/asm/cpuid.h
@@ -179,6 +179,7 @@ static __always_inline bool cpuid_function_is_indexed(u32 function)
case 0x1d:
case 0x1e:
case 0x1f:
+ case 0x24:
case 0x8000001d:
return true;
}
@@ -196,7 +197,12 @@ static inline uint32_t hypervisor_cpuid_base(const char *sig, uint32_t leaves)
for_each_possible_hypervisor_cpuid_base(base) {
cpuid(base, &eax, &signature[0], &signature[1], &signature[2]);
- if (!memcmp(sig, signature, 12) &&
+ /*
+ * This must not compile to "call memcmp" because it's called
+ * from PVH early boot code before instrumentation is set up
+ * and memcmp() itself may be instrumented.
+ */
+ if (!__builtin_memcmp(sig, signature, 12) &&
(leaves == 0 || ((eax - base) >= leaves)))
return base;
}
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 1dc600fa3ba5..521aad70e41b 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -229,7 +229,8 @@ static inline bool efi_is_native(void)
static inline void *efi64_zero_upper(void *p)
{
- ((u32 *)p)[1] = 0;
+ if (p)
+ ((u32 *)p)[1] = 0;
return p;
}
@@ -315,6 +316,10 @@ static inline u32 efi64_convert_status(efi_status_t status)
#define __efi64_argmap_clear_memory_attributes(protocol, phys, size, flags) \
((protocol), __efi64_split(phys), __efi64_split(size), __efi64_split(flags))
+/* EFI SMBIOS protocol */
+#define __efi64_argmap_get_next(protocol, smbioshandle, type, record, phandle) \
+ ((protocol), (smbioshandle), (type), efi64_zero_upper(record), \
+ efi64_zero_upper(phandle))
/*
* The macros below handle the plumbing for the argument mapping. To add a
* mapping for a specific EFI method, simply define a macro
@@ -384,24 +389,8 @@ static inline void efi_reserve_boot_services(void)
}
#endif /* CONFIG_EFI */
-#ifdef CONFIG_EFI_FAKE_MEMMAP
-extern void __init efi_fake_memmap_early(void);
-extern void __init efi_fake_memmap(void);
-#else
-static inline void efi_fake_memmap_early(void)
-{
-}
-
-static inline void efi_fake_memmap(void)
-{
-}
-#endif
-
extern int __init efi_memmap_alloc(unsigned int num_entries,
struct efi_memory_map_data *data);
-extern void __efi_memmap_free(u64 phys, unsigned long size,
- unsigned long flags);
-#define __efi_memmap_free __efi_memmap_free
extern int __init efi_memmap_install(struct efi_memory_map_data *data);
extern int __init efi_memmap_split_count(efi_memory_desc_t *md,
diff --git a/arch/x86/include/asm/entry-common.h b/arch/x86/include/asm/entry-common.h
index 7e523bb3d2d3..77d20555e04d 100644
--- a/arch/x86/include/asm/entry-common.h
+++ b/arch/x86/include/asm/entry-common.h
@@ -8,6 +8,7 @@
#include <asm/nospec-branch.h>
#include <asm/io_bitmap.h>
#include <asm/fpu/api.h>
+#include <asm/fred.h>
/* Check that the stack and regs on entry from user mode are sane. */
static __always_inline void arch_enter_from_user_mode(struct pt_regs *regs)
@@ -44,8 +45,7 @@ static __always_inline void arch_enter_from_user_mode(struct pt_regs *regs)
}
#define arch_enter_from_user_mode arch_enter_from_user_mode
-static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs,
- unsigned long ti_work)
+static inline void arch_exit_work(unsigned long ti_work)
{
if (ti_work & _TIF_USER_RETURN_NOTIFY)
fire_user_return_notifiers();
@@ -56,6 +56,15 @@ static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs,
fpregs_assert_state_consistent();
if (unlikely(ti_work & _TIF_NEED_FPU_LOAD))
switch_fpu_return();
+}
+
+static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs,
+ unsigned long ti_work)
+{
+ if (IS_ENABLED(CONFIG_X86_DEBUG_FPU) || unlikely(ti_work))
+ arch_exit_work(ti_work);
+
+ fred_update_rsp0();
#ifdef CONFIG_COMPAT
/*
@@ -73,19 +82,16 @@ static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs,
#endif
/*
- * Ultimately, this value will get limited by KSTACK_OFFSET_MAX(),
- * but not enough for x86 stack utilization comfort. To keep
- * reasonable stack head room, reduce the maximum offset to 8 bits.
- *
- * The actual entropy will be further reduced by the compiler when
- * applying stack alignment constraints (see cc_stack_align4/8 in
+ * This value will get limited by KSTACK_OFFSET_MAX(), which is 10
+ * bits. The actual entropy will be further reduced by the compiler
+ * when applying stack alignment constraints (see cc_stack_align4/8 in
* arch/x86/Makefile), which will remove the 3 (x86_64) or 2 (ia32)
* low bits from any entropy chosen here.
*
- * Therefore, final stack offset entropy will be 5 (x86_64) or
- * 6 (ia32) bits.
+ * Therefore, final stack offset entropy will be 7 (x86_64) or
+ * 8 (ia32) bits.
*/
- choose_random_kstack_offset(rdtsc() & 0xFF);
+ choose_random_kstack_offset(rdtsc());
}
#define arch_exit_to_user_mode_prepare arch_exit_to_user_mode_prepare
diff --git a/arch/x86/include/asm/extable.h b/arch/x86/include/asm/extable.h
index eeed395c3177..a0e0c6b50155 100644
--- a/arch/x86/include/asm/extable.h
+++ b/arch/x86/include/asm/extable.h
@@ -37,7 +37,6 @@ struct pt_regs;
extern int fixup_exception(struct pt_regs *regs, int trapnr,
unsigned long error_code, unsigned long fault_addr);
-extern int fixup_bug(struct pt_regs *regs, int trapnr);
extern int ex_get_fixup_type(unsigned long ip);
extern void early_fixup_exception(struct pt_regs *regs, int trapnr);
diff --git a/arch/x86/include/asm/extable_fixup_types.h b/arch/x86/include/asm/extable_fixup_types.h
index 7acf0383be80..906b0d5541e8 100644
--- a/arch/x86/include/asm/extable_fixup_types.h
+++ b/arch/x86/include/asm/extable_fixup_types.h
@@ -36,7 +36,7 @@
#define EX_TYPE_DEFAULT 1
#define EX_TYPE_FAULT 2
#define EX_TYPE_UACCESS 3
-#define EX_TYPE_COPY 4
+/* unused, was: #define EX_TYPE_COPY 4 */
#define EX_TYPE_CLEAR_FS 5
#define EX_TYPE_FPU_RESTORE 6
#define EX_TYPE_BPF 7
diff --git a/arch/x86/include/asm/fb.h b/arch/x86/include/asm/fb.h
deleted file mode 100644
index c3b9582de7ef..000000000000
--- a/arch/x86/include/asm/fb.h
+++ /dev/null
@@ -1,19 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_X86_FB_H
-#define _ASM_X86_FB_H
-
-#include <asm/page.h>
-
-struct fb_info;
-
-pgprot_t pgprot_framebuffer(pgprot_t prot,
- unsigned long vm_start, unsigned long vm_end,
- unsigned long offset);
-#define pgprot_framebuffer pgprot_framebuffer
-
-int fb_is_primary_device(struct fb_info *info);
-#define fb_is_primary_device fb_is_primary_device
-
-#include <asm-generic/fb.h>
-
-#endif /* _ASM_X86_FB_H */
diff --git a/arch/x86/include/asm/fpu.h b/arch/x86/include/asm/fpu.h
new file mode 100644
index 000000000000..b2743fe19339
--- /dev/null
+++ b/arch/x86/include/asm/fpu.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2023 SiFive
+ */
+
+#ifndef _ASM_X86_FPU_H
+#define _ASM_X86_FPU_H
+
+#include <asm/fpu/api.h>
+
+#define kernel_fpu_available() true
+
+#endif /* ! _ASM_X86_FPU_H */
diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h
index a2be3aefff9f..f86ad3335529 100644
--- a/arch/x86/include/asm/fpu/api.h
+++ b/arch/x86/include/asm/fpu/api.h
@@ -143,6 +143,9 @@ extern void fpstate_clear_xstate_component(struct fpstate *fps, unsigned int xfe
extern u64 xstate_get_guest_group_perm(void);
+extern void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr);
+
+
/* KVM specific functions */
extern bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu);
extern void fpu_free_guest_fpstate(struct fpu_guest *gfpu);
diff --git a/arch/x86/include/asm/fpu/signal.h b/arch/x86/include/asm/fpu/signal.h
index 611fa41711af..eccc75bc9c4f 100644
--- a/arch/x86/include/asm/fpu/signal.h
+++ b/arch/x86/include/asm/fpu/signal.h
@@ -29,7 +29,7 @@ fpu__alloc_mathframe(unsigned long sp, int ia32_frame,
unsigned long fpu__get_fpstate_size(void);
-extern bool copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size);
+extern bool copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size, u32 pkru);
extern void fpu__clear_user_states(struct fpu *fpu);
extern bool fpu__restore_sig(void __user *buf, int ia32_frame);
diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
index ace9aa3b78a3..de16862bf230 100644
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -2,8 +2,8 @@
/*
* FPU data structures:
*/
-#ifndef _ASM_X86_FPU_H
-#define _ASM_X86_FPU_H
+#ifndef _ASM_X86_FPU_TYPES_H
+#define _ASM_X86_FPU_TYPES_H
#include <asm/page_types.h>
@@ -591,9 +591,16 @@ struct fpu_state_config {
* even without XSAVE support, i.e. legacy features FP + SSE
*/
u64 legacy_features;
+ /*
+ * @independent_features:
+ *
+ * Features that are supported by XSAVES, but not managed as part of
+ * the FPU core, such as LBR
+ */
+ u64 independent_features;
};
/* FPU state configuration information */
extern struct fpu_state_config fpu_kernel_cfg, fpu_user_cfg;
-#endif /* _ASM_X86_FPU_H */
+#endif /* _ASM_X86_FPU_TYPES_H */
diff --git a/arch/x86/include/asm/fred.h b/arch/x86/include/asm/fred.h
index e86c7ba32435..25ca00bd70e8 100644
--- a/arch/x86/include/asm/fred.h
+++ b/arch/x86/include/asm/fred.h
@@ -36,6 +36,7 @@
#ifdef CONFIG_X86_FRED
#include <linux/kernel.h>
+#include <linux/sched/task_stack.h>
#include <asm/ptrace.h>
@@ -84,13 +85,33 @@ static __always_inline void fred_entry_from_kvm(unsigned int type, unsigned int
}
void cpu_init_fred_exceptions(void);
+void cpu_init_fred_rsps(void);
void fred_complete_exception_setup(void);
+DECLARE_PER_CPU(unsigned long, fred_rsp0);
+
+static __always_inline void fred_sync_rsp0(unsigned long rsp0)
+{
+ __this_cpu_write(fred_rsp0, rsp0);
+}
+
+static __always_inline void fred_update_rsp0(void)
+{
+ unsigned long rsp0 = (unsigned long) task_stack_page(current) + THREAD_SIZE;
+
+ if (cpu_feature_enabled(X86_FEATURE_FRED) && (__this_cpu_read(fred_rsp0) != rsp0)) {
+ wrmsrns(MSR_IA32_FRED_RSP0, rsp0);
+ __this_cpu_write(fred_rsp0, rsp0);
+ }
+}
#else /* CONFIG_X86_FRED */
static __always_inline unsigned long fred_event_data(struct pt_regs *regs) { return 0; }
static inline void cpu_init_fred_exceptions(void) { }
+static inline void cpu_init_fred_rsps(void) { }
static inline void fred_complete_exception_setup(void) { }
-static __always_inline void fred_entry_from_kvm(unsigned int type, unsigned int vector) { }
+static inline void fred_entry_from_kvm(unsigned int type, unsigned int vector) { }
+static inline void fred_sync_rsp0(unsigned long rsp0) { }
+static inline void fred_update_rsp0(void) { }
#endif /* CONFIG_X86_FRED */
#endif /* !__ASSEMBLY__ */
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index 897cf02c20b1..b4d719de2c84 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -2,6 +2,8 @@
#ifndef _ASM_X86_FTRACE_H
#define _ASM_X86_FTRACE_H
+#include <asm/ptrace.h>
+
#ifdef CONFIG_FUNCTION_TRACER
#ifndef CC_USING_FENTRY
# error Compiler does not support fentry?
@@ -20,8 +22,6 @@
#define ARCH_SUPPORTS_FTRACE_OPS 1
#endif
-#define HAVE_FUNCTION_GRAPH_RET_ADDR_PTR
-
#ifndef __ASSEMBLY__
extern void __fentry__(void);
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index fbc7722b87d1..6ffa8b75f4cd 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -44,10 +44,16 @@ typedef struct {
unsigned int irq_hv_reenlightenment_count;
unsigned int hyperv_stimer0_count;
#endif
+#ifdef CONFIG_X86_POSTED_MSI
+ unsigned int posted_msi_notification_count;
+#endif
} ____cacheline_aligned irq_cpustat_t;
DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
+#ifdef CONFIG_X86_POSTED_MSI
+DECLARE_PER_CPU_ALIGNED(struct pi_desc, posted_msi_pi_desc);
+#endif
#define __ARCH_IRQ_STAT
#define inc_irq_stat(member) this_cpu_inc(irq_stat.member)
@@ -63,7 +69,11 @@ extern u64 arch_irq_stat(void);
#define local_softirq_pending_ref pcpu_hot.softirq_pending
#if IS_ENABLED(CONFIG_KVM_INTEL)
-static inline void kvm_set_cpu_l1tf_flush_l1d(void)
+/*
+ * This function is called from noinstr interrupt contexts
+ * and must be inlined to not get instrumentation.
+ */
+static __always_inline void kvm_set_cpu_l1tf_flush_l1d(void)
{
__this_cpu_write(irq_stat.kvm_cpu_l1tf_flush_l1d, 1);
}
@@ -78,7 +88,7 @@ static __always_inline bool kvm_get_cpu_l1tf_flush_l1d(void)
return __this_cpu_read(irq_stat.kvm_cpu_l1tf_flush_l1d);
}
#else /* !IS_ENABLED(CONFIG_KVM_INTEL) */
-static inline void kvm_set_cpu_l1tf_flush_l1d(void) { }
+static __always_inline void kvm_set_cpu_l1tf_flush_l1d(void) { }
#endif /* IS_ENABLED(CONFIG_KVM_INTEL) */
#endif /* _ASM_X86_HARDIRQ_H */
diff --git a/arch/x86/include/asm/ia32.h b/arch/x86/include/asm/ia32.h
index 4212c00c9708..9d69f3f8dbab 100644
--- a/arch/x86/include/asm/ia32.h
+++ b/arch/x86/include/asm/ia32.h
@@ -56,17 +56,6 @@ struct stat64 {
unsigned long long st_ino;
} __attribute__((packed));
-#define IA32_STACK_TOP IA32_PAGE_OFFSET
-
-#ifdef __KERNEL__
-struct linux_binprm;
-extern int ia32_setup_arg_pages(struct linux_binprm *bprm,
- unsigned long stack_top, int exec_stack);
-struct mm_struct;
-extern void ia32_pick_mmap_layout(struct mm_struct *mm);
-
-#endif
-
extern bool __ia32_enabled;
static __always_inline bool ia32_enabled(void)
diff --git a/arch/x86/include/asm/ia32_unistd.h b/arch/x86/include/asm/ia32_unistd.h
deleted file mode 100644
index aa065c94ccf5..000000000000
--- a/arch/x86/include/asm/ia32_unistd.h
+++ /dev/null
@@ -1,12 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_X86_IA32_UNISTD_H
-#define _ASM_X86_IA32_UNISTD_H
-
-/*
- * This file contains the system call numbers of the ia32 compat ABI,
- * this is for the kernel only.
- */
-#define __SYSCALL_ia32_NR(x) (x)
-#include <asm/unistd_32_ia32.h>
-
-#endif /* _ASM_X86_IA32_UNISTD_H */
diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h
index 749c7411d2f1..ad5c68f0509d 100644
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -212,8 +212,8 @@ __visible noinstr void func(struct pt_regs *regs, \
irqentry_state_t state = irqentry_enter(regs); \
u32 vector = (u32)(u8)error_code; \
\
+ kvm_set_cpu_l1tf_flush_l1d(); \
instrumentation_begin(); \
- kvm_set_cpu_l1tf_flush_l1d(); \
run_irq_on_irqstack_cond(__##func, regs, vector); \
instrumentation_end(); \
irqentry_exit(regs, state); \
@@ -250,7 +250,6 @@ static void __##func(struct pt_regs *regs); \
\
static __always_inline void instr_##func(struct pt_regs *regs) \
{ \
- kvm_set_cpu_l1tf_flush_l1d(); \
run_sysvec_on_irqstack_cond(__##func, regs); \
} \
\
@@ -258,6 +257,7 @@ __visible noinstr void func(struct pt_regs *regs) \
{ \
irqentry_state_t state = irqentry_enter(regs); \
\
+ kvm_set_cpu_l1tf_flush_l1d(); \
instrumentation_begin(); \
instr_##func (regs); \
instrumentation_end(); \
@@ -288,7 +288,6 @@ static __always_inline void __##func(struct pt_regs *regs); \
static __always_inline void instr_##func(struct pt_regs *regs) \
{ \
__irq_enter_raw(); \
- kvm_set_cpu_l1tf_flush_l1d(); \
__##func (regs); \
__irq_exit_raw(); \
} \
@@ -297,6 +296,7 @@ __visible noinstr void func(struct pt_regs *regs) \
{ \
irqentry_state_t state = irqentry_enter(regs); \
\
+ kvm_set_cpu_l1tf_flush_l1d(); \
instrumentation_begin(); \
instr_##func (regs); \
instrumentation_end(); \
@@ -751,6 +751,12 @@ DECLARE_IDTENTRY_SYSVEC(POSTED_INTR_NESTED_VECTOR, sysvec_kvm_posted_intr_nested
# define fred_sysvec_kvm_posted_intr_nested_ipi NULL
#endif
+# ifdef CONFIG_X86_POSTED_MSI
+DECLARE_IDTENTRY_SYSVEC(POSTED_MSI_NOTIFICATION_VECTOR, sysvec_posted_msi_notification);
+#else
+# define fred_sysvec_posted_msi_notification NULL
+# endif
+
#if IS_ENABLED(CONFIG_HYPERV)
DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_CALLBACK_VECTOR, sysvec_hyperv_callback);
DECLARE_IDTENTRY_SYSVEC(HYPERV_REENLIGHTENMENT_VECTOR, sysvec_hyperv_reenlightenment);
diff --git a/arch/x86/include/asm/inat.h b/arch/x86/include/asm/inat.h
index b56c5741581a..53e4015242b4 100644
--- a/arch/x86/include/asm/inat.h
+++ b/arch/x86/include/asm/inat.h
@@ -35,6 +35,8 @@
#define INAT_PFX_VEX2 13 /* 2-bytes VEX prefix */
#define INAT_PFX_VEX3 14 /* 3-bytes VEX prefix */
#define INAT_PFX_EVEX 15 /* EVEX prefix */
+/* x86-64 REX2 prefix */
+#define INAT_PFX_REX2 16 /* 0xD5 */
#define INAT_LSTPFX_MAX 3
#define INAT_LGCPFX_MAX 11
@@ -50,7 +52,7 @@
/* Legacy prefix */
#define INAT_PFX_OFFS 0
-#define INAT_PFX_BITS 4
+#define INAT_PFX_BITS 5
#define INAT_PFX_MAX ((1 << INAT_PFX_BITS) - 1)
#define INAT_PFX_MASK (INAT_PFX_MAX << INAT_PFX_OFFS)
/* Escape opcodes */
@@ -77,6 +79,9 @@
#define INAT_VEXOK (1 << (INAT_FLAG_OFFS + 5))
#define INAT_VEXONLY (1 << (INAT_FLAG_OFFS + 6))
#define INAT_EVEXONLY (1 << (INAT_FLAG_OFFS + 7))
+#define INAT_NO_REX2 (1 << (INAT_FLAG_OFFS + 8))
+#define INAT_REX2_VARIANT (1 << (INAT_FLAG_OFFS + 9))
+#define INAT_EVEX_SCALABLE (1 << (INAT_FLAG_OFFS + 10))
/* Attribute making macros for attribute tables */
#define INAT_MAKE_PREFIX(pfx) (pfx << INAT_PFX_OFFS)
#define INAT_MAKE_ESCAPE(esc) (esc << INAT_ESC_OFFS)
@@ -128,6 +133,11 @@ static inline int inat_is_rex_prefix(insn_attr_t attr)
return (attr & INAT_PFX_MASK) == INAT_PFX_REX;
}
+static inline int inat_is_rex2_prefix(insn_attr_t attr)
+{
+ return (attr & INAT_PFX_MASK) == INAT_PFX_REX2;
+}
+
static inline int inat_last_prefix_id(insn_attr_t attr)
{
if ((attr & INAT_PFX_MASK) > INAT_LSTPFX_MAX)
@@ -227,4 +237,9 @@ static inline int inat_must_evex(insn_attr_t attr)
{
return attr & INAT_EVEXONLY;
}
+
+static inline int inat_evex_scalable(insn_attr_t attr)
+{
+ return attr & INAT_EVEX_SCALABLE;
+}
#endif
diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h
index cc9ccf61b6bd..14d72727d7ee 100644
--- a/arch/x86/include/asm/init.h
+++ b/arch/x86/include/asm/init.h
@@ -6,6 +6,7 @@
struct x86_mapping_info {
void *(*alloc_pgt_page)(void *); /* allocate buf for page table */
+ void (*free_pgt_page)(void *, void *); /* free buf for page table */
void *context; /* context for alloc_pgt_page */
unsigned long page_flag; /* page flag for PMD or PUD entry */
unsigned long offset; /* ident mapping offset */
@@ -16,4 +17,6 @@ struct x86_mapping_info {
int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
unsigned long pstart, unsigned long pend);
+void kernel_ident_mapping_free(struct x86_mapping_info *info, pgd_t *pgd);
+
#endif /* _ASM_X86_INIT_H */
diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h
index 1b29f58f730f..7152ea809e6a 100644
--- a/arch/x86/include/asm/insn.h
+++ b/arch/x86/include/asm/insn.h
@@ -112,10 +112,15 @@ struct insn {
#define X86_SIB_INDEX(sib) (((sib) & 0x38) >> 3)
#define X86_SIB_BASE(sib) ((sib) & 0x07)
-#define X86_REX_W(rex) ((rex) & 8)
-#define X86_REX_R(rex) ((rex) & 4)
-#define X86_REX_X(rex) ((rex) & 2)
-#define X86_REX_B(rex) ((rex) & 1)
+#define X86_REX2_M(rex) ((rex) & 0x80) /* REX2 M0 */
+#define X86_REX2_R(rex) ((rex) & 0x40) /* REX2 R4 */
+#define X86_REX2_X(rex) ((rex) & 0x20) /* REX2 X4 */
+#define X86_REX2_B(rex) ((rex) & 0x10) /* REX2 B4 */
+
+#define X86_REX_W(rex) ((rex) & 8) /* REX or REX2 W */
+#define X86_REX_R(rex) ((rex) & 4) /* REX or REX2 R3 */
+#define X86_REX_X(rex) ((rex) & 2) /* REX or REX2 X3 */
+#define X86_REX_B(rex) ((rex) & 1) /* REX or REX2 B3 */
/* VEX bit flags */
#define X86_VEX_W(vex) ((vex) & 0x80) /* VEX3 Byte2 */
@@ -161,6 +166,18 @@ static inline void insn_get_attribute(struct insn *insn)
/* Instruction uses RIP-relative addressing */
extern int insn_rip_relative(struct insn *insn);
+static inline int insn_is_rex2(struct insn *insn)
+{
+ if (!insn->prefixes.got)
+ insn_get_prefixes(insn);
+ return insn->rex_prefix.nbytes == 2;
+}
+
+static inline insn_byte_t insn_rex2_m_bit(struct insn *insn)
+{
+ return X86_REX2_M(insn->rex_prefix.bytes[1]);
+}
+
static inline int insn_is_avx(struct insn *insn)
{
if (!insn->prefixes.got)
@@ -198,6 +215,13 @@ static inline insn_byte_t insn_vex_p_bits(struct insn *insn)
return X86_VEX_P(insn->vex_prefix.bytes[2]);
}
+static inline insn_byte_t insn_vex_w_bit(struct insn *insn)
+{
+ if (insn->vex_prefix.nbytes < 3)
+ return 0;
+ return X86_VEX_W(insn->vex_prefix.bytes[2]);
+}
+
/* Get the last prefix id from last prefix or VEX prefix */
static inline int insn_last_prefix_id(struct insn *insn)
{
diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h
index d0941f4c2724..1a42f829667a 100644
--- a/arch/x86/include/asm/intel-family.h
+++ b/arch/x86/include/asm/intel-family.h
@@ -10,7 +10,7 @@
* that group keep the CPUID for the variants sorted by model number.
*
* The defined symbol names have the following form:
- * INTEL_FAM6{OPTFAMILY}_{MICROARCH}{OPTDIFF}
+ * INTEL_{OPTFAMILY}_{MICROARCH}{OPTDIFF}
* where:
* OPTFAMILY Describes the family of CPUs that this belongs to. Default
* is assumed to be "_CORE" (and should be omitted). Other values
@@ -40,137 +40,147 @@
* their own names :-(
*/
-/* Wildcard match for FAM6 so X86_MATCH_INTEL_FAM6_MODEL(ANY) works */
-#define INTEL_FAM6_ANY X86_MODEL_ANY
+#define IFM(_fam, _model) VFM_MAKE(X86_VENDOR_INTEL, _fam, _model)
-#define INTEL_FAM6_CORE_YONAH 0x0E
+/* Wildcard match so X86_MATCH_VFM(ANY) works */
+#define INTEL_ANY IFM(X86_FAMILY_ANY, X86_MODEL_ANY)
-#define INTEL_FAM6_CORE2_MEROM 0x0F
-#define INTEL_FAM6_CORE2_MEROM_L 0x16
-#define INTEL_FAM6_CORE2_PENRYN 0x17
-#define INTEL_FAM6_CORE2_DUNNINGTON 0x1D
+#define INTEL_PENTIUM_PRO IFM(6, 0x01)
-#define INTEL_FAM6_NEHALEM 0x1E
-#define INTEL_FAM6_NEHALEM_G 0x1F /* Auburndale / Havendale */
-#define INTEL_FAM6_NEHALEM_EP 0x1A
-#define INTEL_FAM6_NEHALEM_EX 0x2E
+#define INTEL_CORE_YONAH IFM(6, 0x0E)
-#define INTEL_FAM6_WESTMERE 0x25
-#define INTEL_FAM6_WESTMERE_EP 0x2C
-#define INTEL_FAM6_WESTMERE_EX 0x2F
+#define INTEL_CORE2_MEROM IFM(6, 0x0F)
+#define INTEL_CORE2_MEROM_L IFM(6, 0x16)
+#define INTEL_CORE2_PENRYN IFM(6, 0x17)
+#define INTEL_CORE2_DUNNINGTON IFM(6, 0x1D)
-#define INTEL_FAM6_SANDYBRIDGE 0x2A
-#define INTEL_FAM6_SANDYBRIDGE_X 0x2D
-#define INTEL_FAM6_IVYBRIDGE 0x3A
-#define INTEL_FAM6_IVYBRIDGE_X 0x3E
+#define INTEL_NEHALEM IFM(6, 0x1E)
+#define INTEL_NEHALEM_G IFM(6, 0x1F) /* Auburndale / Havendale */
+#define INTEL_NEHALEM_EP IFM(6, 0x1A)
+#define INTEL_NEHALEM_EX IFM(6, 0x2E)
-#define INTEL_FAM6_HASWELL 0x3C
-#define INTEL_FAM6_HASWELL_X 0x3F
-#define INTEL_FAM6_HASWELL_L 0x45
-#define INTEL_FAM6_HASWELL_G 0x46
+#define INTEL_WESTMERE IFM(6, 0x25)
+#define INTEL_WESTMERE_EP IFM(6, 0x2C)
+#define INTEL_WESTMERE_EX IFM(6, 0x2F)
-#define INTEL_FAM6_BROADWELL 0x3D
-#define INTEL_FAM6_BROADWELL_G 0x47
-#define INTEL_FAM6_BROADWELL_X 0x4F
-#define INTEL_FAM6_BROADWELL_D 0x56
+#define INTEL_SANDYBRIDGE IFM(6, 0x2A)
+#define INTEL_SANDYBRIDGE_X IFM(6, 0x2D)
+#define INTEL_IVYBRIDGE IFM(6, 0x3A)
+#define INTEL_IVYBRIDGE_X IFM(6, 0x3E)
-#define INTEL_FAM6_SKYLAKE_L 0x4E /* Sky Lake */
-#define INTEL_FAM6_SKYLAKE 0x5E /* Sky Lake */
-#define INTEL_FAM6_SKYLAKE_X 0x55 /* Sky Lake */
+#define INTEL_HASWELL IFM(6, 0x3C)
+#define INTEL_HASWELL_X IFM(6, 0x3F)
+#define INTEL_HASWELL_L IFM(6, 0x45)
+#define INTEL_HASWELL_G IFM(6, 0x46)
+
+#define INTEL_BROADWELL IFM(6, 0x3D)
+#define INTEL_BROADWELL_G IFM(6, 0x47)
+#define INTEL_BROADWELL_X IFM(6, 0x4F)
+#define INTEL_BROADWELL_D IFM(6, 0x56)
+
+#define INTEL_SKYLAKE_L IFM(6, 0x4E) /* Sky Lake */
+#define INTEL_SKYLAKE IFM(6, 0x5E) /* Sky Lake */
+#define INTEL_SKYLAKE_X IFM(6, 0x55) /* Sky Lake */
/* CASCADELAKE_X 0x55 Sky Lake -- s: 7 */
/* COOPERLAKE_X 0x55 Sky Lake -- s: 11 */
-#define INTEL_FAM6_KABYLAKE_L 0x8E /* Sky Lake */
+#define INTEL_KABYLAKE_L IFM(6, 0x8E) /* Sky Lake */
/* AMBERLAKE_L 0x8E Sky Lake -- s: 9 */
/* COFFEELAKE_L 0x8E Sky Lake -- s: 10 */
/* WHISKEYLAKE_L 0x8E Sky Lake -- s: 11,12 */
-#define INTEL_FAM6_KABYLAKE 0x9E /* Sky Lake */
+#define INTEL_KABYLAKE IFM(6, 0x9E) /* Sky Lake */
/* COFFEELAKE 0x9E Sky Lake -- s: 10-13 */
-#define INTEL_FAM6_COMETLAKE 0xA5 /* Sky Lake */
-#define INTEL_FAM6_COMETLAKE_L 0xA6 /* Sky Lake */
+#define INTEL_COMETLAKE IFM(6, 0xA5) /* Sky Lake */
+#define INTEL_COMETLAKE_L IFM(6, 0xA6) /* Sky Lake */
-#define INTEL_FAM6_CANNONLAKE_L 0x66 /* Palm Cove */
+#define INTEL_CANNONLAKE_L IFM(6, 0x66) /* Palm Cove */
-#define INTEL_FAM6_ICELAKE_X 0x6A /* Sunny Cove */
-#define INTEL_FAM6_ICELAKE_D 0x6C /* Sunny Cove */
-#define INTEL_FAM6_ICELAKE 0x7D /* Sunny Cove */
-#define INTEL_FAM6_ICELAKE_L 0x7E /* Sunny Cove */
-#define INTEL_FAM6_ICELAKE_NNPI 0x9D /* Sunny Cove */
+#define INTEL_ICELAKE_X IFM(6, 0x6A) /* Sunny Cove */
+#define INTEL_ICELAKE_D IFM(6, 0x6C) /* Sunny Cove */
+#define INTEL_ICELAKE IFM(6, 0x7D) /* Sunny Cove */
+#define INTEL_ICELAKE_L IFM(6, 0x7E) /* Sunny Cove */
+#define INTEL_ICELAKE_NNPI IFM(6, 0x9D) /* Sunny Cove */
-#define INTEL_FAM6_ROCKETLAKE 0xA7 /* Cypress Cove */
+#define INTEL_ROCKETLAKE IFM(6, 0xA7) /* Cypress Cove */
-#define INTEL_FAM6_TIGERLAKE_L 0x8C /* Willow Cove */
-#define INTEL_FAM6_TIGERLAKE 0x8D /* Willow Cove */
+#define INTEL_TIGERLAKE_L IFM(6, 0x8C) /* Willow Cove */
+#define INTEL_TIGERLAKE IFM(6, 0x8D) /* Willow Cove */
-#define INTEL_FAM6_SAPPHIRERAPIDS_X 0x8F /* Golden Cove */
+#define INTEL_SAPPHIRERAPIDS_X IFM(6, 0x8F) /* Golden Cove */
-#define INTEL_FAM6_EMERALDRAPIDS_X 0xCF
+#define INTEL_EMERALDRAPIDS_X IFM(6, 0xCF)
-#define INTEL_FAM6_GRANITERAPIDS_X 0xAD
-#define INTEL_FAM6_GRANITERAPIDS_D 0xAE
+#define INTEL_GRANITERAPIDS_X IFM(6, 0xAD)
+#define INTEL_GRANITERAPIDS_D IFM(6, 0xAE)
/* "Hybrid" Processors (P-Core/E-Core) */
-#define INTEL_FAM6_LAKEFIELD 0x8A /* Sunny Cove / Tremont */
+#define INTEL_LAKEFIELD IFM(6, 0x8A) /* Sunny Cove / Tremont */
-#define INTEL_FAM6_ALDERLAKE 0x97 /* Golden Cove / Gracemont */
-#define INTEL_FAM6_ALDERLAKE_L 0x9A /* Golden Cove / Gracemont */
+#define INTEL_ALDERLAKE IFM(6, 0x97) /* Golden Cove / Gracemont */
+#define INTEL_ALDERLAKE_L IFM(6, 0x9A) /* Golden Cove / Gracemont */
-#define INTEL_FAM6_RAPTORLAKE 0xB7 /* Raptor Cove / Enhanced Gracemont */
-#define INTEL_FAM6_RAPTORLAKE_P 0xBA
-#define INTEL_FAM6_RAPTORLAKE_S 0xBF
+#define INTEL_RAPTORLAKE IFM(6, 0xB7) /* Raptor Cove / Enhanced Gracemont */
+#define INTEL_RAPTORLAKE_P IFM(6, 0xBA)
+#define INTEL_RAPTORLAKE_S IFM(6, 0xBF)
-#define INTEL_FAM6_METEORLAKE 0xAC
-#define INTEL_FAM6_METEORLAKE_L 0xAA
+#define INTEL_METEORLAKE IFM(6, 0xAC)
+#define INTEL_METEORLAKE_L IFM(6, 0xAA)
-#define INTEL_FAM6_ARROWLAKE_H 0xC5
-#define INTEL_FAM6_ARROWLAKE 0xC6
-#define INTEL_FAM6_ARROWLAKE_U 0xB5
+#define INTEL_ARROWLAKE_H IFM(6, 0xC5)
+#define INTEL_ARROWLAKE IFM(6, 0xC6)
+#define INTEL_ARROWLAKE_U IFM(6, 0xB5)
-#define INTEL_FAM6_LUNARLAKE_M 0xBD
+#define INTEL_LUNARLAKE_M IFM(6, 0xBD)
+
+#define INTEL_PANTHERLAKE_L IFM(6, 0xCC)
/* "Small Core" Processors (Atom/E-Core) */
-#define INTEL_FAM6_ATOM_BONNELL 0x1C /* Diamondville, Pineview */
-#define INTEL_FAM6_ATOM_BONNELL_MID 0x26 /* Silverthorne, Lincroft */
+#define INTEL_ATOM_BONNELL IFM(6, 0x1C) /* Diamondville, Pineview */
+#define INTEL_ATOM_BONNELL_MID IFM(6, 0x26) /* Silverthorne, Lincroft */
-#define INTEL_FAM6_ATOM_SALTWELL 0x36 /* Cedarview */
-#define INTEL_FAM6_ATOM_SALTWELL_MID 0x27 /* Penwell */
-#define INTEL_FAM6_ATOM_SALTWELL_TABLET 0x35 /* Cloverview */
+#define INTEL_ATOM_SALTWELL IFM(6, 0x36) /* Cedarview */
+#define INTEL_ATOM_SALTWELL_MID IFM(6, 0x27) /* Penwell */
+#define INTEL_ATOM_SALTWELL_TABLET IFM(6, 0x35) /* Cloverview */
-#define INTEL_FAM6_ATOM_SILVERMONT 0x37 /* Bay Trail, Valleyview */
-#define INTEL_FAM6_ATOM_SILVERMONT_D 0x4D /* Avaton, Rangely */
-#define INTEL_FAM6_ATOM_SILVERMONT_MID 0x4A /* Merriefield */
+#define INTEL_ATOM_SILVERMONT IFM(6, 0x37) /* Bay Trail, Valleyview */
+#define INTEL_ATOM_SILVERMONT_D IFM(6, 0x4D) /* Avaton, Rangely */
+#define INTEL_ATOM_SILVERMONT_MID IFM(6, 0x4A) /* Merriefield */
-#define INTEL_FAM6_ATOM_AIRMONT 0x4C /* Cherry Trail, Braswell */
-#define INTEL_FAM6_ATOM_AIRMONT_MID 0x5A /* Moorefield */
-#define INTEL_FAM6_ATOM_AIRMONT_NP 0x75 /* Lightning Mountain */
+#define INTEL_ATOM_AIRMONT IFM(6, 0x4C) /* Cherry Trail, Braswell */
+#define INTEL_ATOM_AIRMONT_MID IFM(6, 0x5A) /* Moorefield */
+#define INTEL_ATOM_AIRMONT_NP IFM(6, 0x75) /* Lightning Mountain */
-#define INTEL_FAM6_ATOM_GOLDMONT 0x5C /* Apollo Lake */
-#define INTEL_FAM6_ATOM_GOLDMONT_D 0x5F /* Denverton */
+#define INTEL_ATOM_GOLDMONT IFM(6, 0x5C) /* Apollo Lake */
+#define INTEL_ATOM_GOLDMONT_D IFM(6, 0x5F) /* Denverton */
/* Note: the micro-architecture is "Goldmont Plus" */
-#define INTEL_FAM6_ATOM_GOLDMONT_PLUS 0x7A /* Gemini Lake */
+#define INTEL_ATOM_GOLDMONT_PLUS IFM(6, 0x7A) /* Gemini Lake */
-#define INTEL_FAM6_ATOM_TREMONT_D 0x86 /* Jacobsville */
-#define INTEL_FAM6_ATOM_TREMONT 0x96 /* Elkhart Lake */
-#define INTEL_FAM6_ATOM_TREMONT_L 0x9C /* Jasper Lake */
+#define INTEL_ATOM_TREMONT_D IFM(6, 0x86) /* Jacobsville */
+#define INTEL_ATOM_TREMONT IFM(6, 0x96) /* Elkhart Lake */
+#define INTEL_ATOM_TREMONT_L IFM(6, 0x9C) /* Jasper Lake */
-#define INTEL_FAM6_ATOM_GRACEMONT 0xBE /* Alderlake N */
+#define INTEL_ATOM_GRACEMONT IFM(6, 0xBE) /* Alderlake N */
-#define INTEL_FAM6_ATOM_CRESTMONT_X 0xAF /* Sierra Forest */
-#define INTEL_FAM6_ATOM_CRESTMONT 0xB6 /* Grand Ridge */
+#define INTEL_ATOM_CRESTMONT_X IFM(6, 0xAF) /* Sierra Forest */
+#define INTEL_ATOM_CRESTMONT IFM(6, 0xB6) /* Grand Ridge */
-#define INTEL_FAM6_ATOM_DARKMONT_X 0xDD /* Clearwater Forest */
+#define INTEL_ATOM_DARKMONT_X IFM(6, 0xDD) /* Clearwater Forest */
/* Xeon Phi */
-#define INTEL_FAM6_XEON_PHI_KNL 0x57 /* Knights Landing */
-#define INTEL_FAM6_XEON_PHI_KNM 0x85 /* Knights Mill */
+#define INTEL_XEON_PHI_KNL IFM(6, 0x57) /* Knights Landing */
+#define INTEL_XEON_PHI_KNM IFM(6, 0x85) /* Knights Mill */
/* Family 5 */
#define INTEL_FAM5_QUARK_X1000 0x09 /* Quark X1000 SoC */
+#define INTEL_QUARK_X1000 IFM(5, 0x09) /* Quark X1000 SoC */
+
+/* Family 19 */
+#define INTEL_PANTHERCOVE_X IFM(19, 0x01) /* Diamond Rapids */
#endif /* _ASM_X86_INTEL_FAMILY_H */
diff --git a/arch/x86/include/asm/intel_ds.h b/arch/x86/include/asm/intel_ds.h
index 2f9eeb5c3069..5dbeac48a5b9 100644
--- a/arch/x86/include/asm/intel_ds.h
+++ b/arch/x86/include/asm/intel_ds.h
@@ -9,6 +9,7 @@
/* The maximal number of PEBS events: */
#define MAX_PEBS_EVENTS_FMT4 8
#define MAX_PEBS_EVENTS 32
+#define MAX_PEBS_EVENTS_MASK GENMASK_ULL(MAX_PEBS_EVENTS - 1, 0)
#define MAX_FIXED_PEBS_EVENTS 16
/*
diff --git a/arch/x86/include/asm/intel_pconfig.h b/arch/x86/include/asm/intel_pconfig.h
deleted file mode 100644
index 994638ef171b..000000000000
--- a/arch/x86/include/asm/intel_pconfig.h
+++ /dev/null
@@ -1,65 +0,0 @@
-#ifndef _ASM_X86_INTEL_PCONFIG_H
-#define _ASM_X86_INTEL_PCONFIG_H
-
-#include <asm/asm.h>
-#include <asm/processor.h>
-
-enum pconfig_target {
- INVALID_TARGET = 0,
- MKTME_TARGET = 1,
- PCONFIG_TARGET_NR
-};
-
-int pconfig_target_supported(enum pconfig_target target);
-
-enum pconfig_leaf {
- MKTME_KEY_PROGRAM = 0,
- PCONFIG_LEAF_INVALID,
-};
-
-#define PCONFIG ".byte 0x0f, 0x01, 0xc5"
-
-/* Defines and structure for MKTME_KEY_PROGRAM of PCONFIG instruction */
-
-/* mktme_key_program::keyid_ctrl COMMAND, bits [7:0] */
-#define MKTME_KEYID_SET_KEY_DIRECT 0
-#define MKTME_KEYID_SET_KEY_RANDOM 1
-#define MKTME_KEYID_CLEAR_KEY 2
-#define MKTME_KEYID_NO_ENCRYPT 3
-
-/* mktme_key_program::keyid_ctrl ENC_ALG, bits [23:8] */
-#define MKTME_AES_XTS_128 (1 << 8)
-
-/* Return codes from the PCONFIG MKTME_KEY_PROGRAM */
-#define MKTME_PROG_SUCCESS 0
-#define MKTME_INVALID_PROG_CMD 1
-#define MKTME_ENTROPY_ERROR 2
-#define MKTME_INVALID_KEYID 3
-#define MKTME_INVALID_ENC_ALG 4
-#define MKTME_DEVICE_BUSY 5
-
-/* Hardware requires the structure to be 256 byte aligned. Otherwise #GP(0). */
-struct mktme_key_program {
- u16 keyid;
- u32 keyid_ctrl;
- u8 __rsvd[58];
- u8 key_field_1[64];
- u8 key_field_2[64];
-} __packed __aligned(256);
-
-static inline int mktme_key_program(struct mktme_key_program *key_program)
-{
- unsigned long rax = MKTME_KEY_PROGRAM;
-
- if (!pconfig_target_supported(MKTME_TARGET))
- return -ENXIO;
-
- asm volatile(PCONFIG
- : "=a" (rax), "=b" (key_program)
- : "0" (rax), "1" (key_program)
- : "memory", "cc");
-
- return rax;
-}
-
-#endif /* _ASM_X86_INTEL_PCONFIG_H */
diff --git a/arch/x86/include/asm/intel_scu_ipc.h b/arch/x86/include/asm/intel_scu_ipc.h
deleted file mode 100644
index 8537f597d20a..000000000000
--- a/arch/x86/include/asm/intel_scu_ipc.h
+++ /dev/null
@@ -1,68 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_X86_INTEL_SCU_IPC_H_
-#define _ASM_X86_INTEL_SCU_IPC_H_
-
-#include <linux/ioport.h>
-
-struct device;
-struct intel_scu_ipc_dev;
-
-/**
- * struct intel_scu_ipc_data - Data used to configure SCU IPC
- * @mem: Base address of SCU IPC MMIO registers
- * @irq: The IRQ number used for SCU (optional)
- */
-struct intel_scu_ipc_data {
- struct resource mem;
- int irq;
-};
-
-struct intel_scu_ipc_dev *
-__intel_scu_ipc_register(struct device *parent,
- const struct intel_scu_ipc_data *scu_data,
- struct module *owner);
-
-#define intel_scu_ipc_register(parent, scu_data) \
- __intel_scu_ipc_register(parent, scu_data, THIS_MODULE)
-
-void intel_scu_ipc_unregister(struct intel_scu_ipc_dev *scu);
-
-struct intel_scu_ipc_dev *
-__devm_intel_scu_ipc_register(struct device *parent,
- const struct intel_scu_ipc_data *scu_data,
- struct module *owner);
-
-#define devm_intel_scu_ipc_register(parent, scu_data) \
- __devm_intel_scu_ipc_register(parent, scu_data, THIS_MODULE)
-
-struct intel_scu_ipc_dev *intel_scu_ipc_dev_get(void);
-void intel_scu_ipc_dev_put(struct intel_scu_ipc_dev *scu);
-struct intel_scu_ipc_dev *devm_intel_scu_ipc_dev_get(struct device *dev);
-
-int intel_scu_ipc_dev_ioread8(struct intel_scu_ipc_dev *scu, u16 addr,
- u8 *data);
-int intel_scu_ipc_dev_iowrite8(struct intel_scu_ipc_dev *scu, u16 addr,
- u8 data);
-int intel_scu_ipc_dev_readv(struct intel_scu_ipc_dev *scu, u16 *addr,
- u8 *data, size_t len);
-int intel_scu_ipc_dev_writev(struct intel_scu_ipc_dev *scu, u16 *addr,
- u8 *data, size_t len);
-
-int intel_scu_ipc_dev_update(struct intel_scu_ipc_dev *scu, u16 addr,
- u8 data, u8 mask);
-
-int intel_scu_ipc_dev_simple_command(struct intel_scu_ipc_dev *scu, int cmd,
- int sub);
-int intel_scu_ipc_dev_command_with_size(struct intel_scu_ipc_dev *scu, int cmd,
- int sub, const void *in, size_t inlen,
- size_t size, void *out, size_t outlen);
-
-static inline int intel_scu_ipc_dev_command(struct intel_scu_ipc_dev *scu, int cmd,
- int sub, const void *in, size_t inlen,
- void *out, size_t outlen)
-{
- return intel_scu_ipc_dev_command_with_size(scu, cmd, sub, in, inlen,
- inlen, out, outlen);
-}
-
-#endif
diff --git a/arch/x86/include/asm/intel_telemetry.h b/arch/x86/include/asm/intel_telemetry.h
index 8046e70dfd7c..43b7657febca 100644
--- a/arch/x86/include/asm/intel_telemetry.h
+++ b/arch/x86/include/asm/intel_telemetry.h
@@ -10,7 +10,7 @@
#define TELEM_MAX_EVENTS_SRAM 28
#define TELEM_MAX_OS_ALLOCATED_EVENTS 20
-#include <asm/intel_scu_ipc.h>
+#include <linux/platform_data/x86/intel_scu_ipc.h>
enum telemetry_unit {
TELEM_PSS = 0,
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 294cd2a40818..1d60427379c9 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -42,6 +42,7 @@
#include <asm/early_ioremap.h>
#include <asm/pgtable_types.h>
#include <asm/shared/io.h>
+#include <asm/special_insns.h>
#define build_mmio_read(name, size, type, reg, barrier) \
static inline type name(const volatile void __iomem *addr) \
@@ -209,6 +210,23 @@ void memset_io(volatile void __iomem *, int, size_t);
#define memcpy_toio memcpy_toio
#define memset_io memset_io
+#ifdef CONFIG_X86_64
+/*
+ * Commit 0f07496144c2 ("[PATCH] Add faster __iowrite32_copy routine for
+ * x86_64") says that circa 2006 rep movsl is noticeably faster than a copy
+ * loop.
+ */
+static inline void __iowrite32_copy(void __iomem *to, const void *from,
+ size_t count)
+{
+ asm volatile("rep ; movsl"
+ : "=&c"(count), "=&D"(to), "=&S"(from)
+ : "0"(count), "1"(to), "2"(from)
+ : "memory");
+}
+#define __iowrite32_copy __iowrite32_copy
+#endif
+
/*
* ISA space is 'always mapped' on a typical x86 system, no need to
* explicitly ioremap() it. The fact that the ISA IO space is mapped
diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h
index 7a2ed154a5e1..5036f13ab69f 100644
--- a/arch/x86/include/asm/irq_remapping.h
+++ b/arch/x86/include/asm/irq_remapping.h
@@ -50,6 +50,13 @@ static inline struct irq_domain *arch_get_ir_parent_domain(void)
return x86_vector_domain;
}
+extern bool enable_posted_msi;
+
+static inline bool posted_msi_supported(void)
+{
+ return enable_posted_msi && irq_remapping_cap(IRQ_POSTING_CAP);
+}
+
#else /* CONFIG_IRQ_REMAP */
static inline bool irq_remapping_cap(enum irq_remap_cap cap) { return 0; }
diff --git a/arch/x86/include/asm/irq_stack.h b/arch/x86/include/asm/irq_stack.h
index 798183867d78..b71ad173f877 100644
--- a/arch/x86/include/asm/irq_stack.h
+++ b/arch/x86/include/asm/irq_stack.h
@@ -100,7 +100,7 @@
}
#define ASM_CALL_ARG0 \
- "call %P[__func] \n" \
+ "call %c[__func] \n" \
ASM_REACHABLE
#define ASM_CALL_ARG1 \
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index d18bfb238f66..47051871b436 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -18,8 +18,8 @@
* Vectors 0 ... 31 : system traps and exceptions - hardcoded events
* Vectors 32 ... 127 : device interrupts
* Vector 128 : legacy int80 syscall interface
- * Vectors 129 ... LOCAL_TIMER_VECTOR-1
- * Vectors LOCAL_TIMER_VECTOR ... 255 : special interrupts
+ * Vectors 129 ... FIRST_SYSTEM_VECTOR-1 : device interrupts
+ * Vectors FIRST_SYSTEM_VECTOR ... 255 : special interrupts
*
* 64-bit x86 has per CPU IDT tables, 32-bit has one shared IDT table.
*
@@ -97,10 +97,16 @@
#define LOCAL_TIMER_VECTOR 0xec
+/*
+ * Posted interrupt notification vector for all device MSIs delivered to
+ * the host kernel.
+ */
+#define POSTED_MSI_NOTIFICATION_VECTOR 0xeb
+
#define NR_VECTORS 256
#ifdef CONFIG_X86_LOCAL_APIC
-#define FIRST_SYSTEM_VECTOR LOCAL_TIMER_VECTOR
+#define FIRST_SYSTEM_VECTOR POSTED_MSI_NOTIFICATION_VECTOR
#else
#define FIRST_SYSTEM_VECTOR NR_VECTORS
#endif
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index 8c5ae649d2df..cf7fc2b8e3ce 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -54,6 +54,26 @@ static __always_inline void native_halt(void)
asm volatile("hlt": : :"memory");
}
+static __always_inline int native_irqs_disabled_flags(unsigned long flags)
+{
+ return !(flags & X86_EFLAGS_IF);
+}
+
+static __always_inline unsigned long native_local_irq_save(void)
+{
+ unsigned long flags = native_save_fl();
+
+ native_irq_disable();
+
+ return flags;
+}
+
+static __always_inline void native_local_irq_restore(unsigned long flags)
+{
+ if (!native_irqs_disabled_flags(flags))
+ native_irq_enable();
+}
+
#endif
#ifdef CONFIG_PARAVIRT_XXL
diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index 91ca9a9ee3a2..ae5482a2f0ca 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -207,18 +207,11 @@ int arch_kimage_file_post_load_cleanup(struct kimage *image);
extern void kdump_nmi_shootdown_cpus(void);
#ifdef CONFIG_CRASH_HOTPLUG
-void arch_crash_handle_hotplug_event(struct kimage *image);
+void arch_crash_handle_hotplug_event(struct kimage *image, void *arg);
#define arch_crash_handle_hotplug_event arch_crash_handle_hotplug_event
-#ifdef CONFIG_HOTPLUG_CPU
-int arch_crash_hotplug_cpu_support(void);
-#define crash_hotplug_cpu_support arch_crash_hotplug_cpu_support
-#endif
-
-#ifdef CONFIG_MEMORY_HOTPLUG
-int arch_crash_hotplug_memory_support(void);
-#define crash_hotplug_memory_support arch_crash_hotplug_memory_support
-#endif
+int arch_crash_hotplug_support(struct kimage *image, unsigned long kexec_flags);
+#define arch_crash_hotplug_support arch_crash_hotplug_support
unsigned int arch_crash_get_elfcorehdr_size(void);
#define crash_get_elfcorehdr_size arch_crash_get_elfcorehdr_size
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index 110d7f29ca9a..861d080ed4c6 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -9,14 +9,13 @@ BUILD_BUG_ON(1)
* "static_call_update()" calls.
*
* KVM_X86_OP_OPTIONAL() can be used for those functions that can have
- * a NULL definition, for example if "static_call_cond()" will be used
- * at the call sites. KVM_X86_OP_OPTIONAL_RET0() can be used likewise
+ * a NULL definition. KVM_X86_OP_OPTIONAL_RET0() can be used likewise
* to make a definition optional, but in this case the default will
* be __static_call_return0.
*/
KVM_X86_OP(check_processor_compatibility)
-KVM_X86_OP(hardware_enable)
-KVM_X86_OP(hardware_disable)
+KVM_X86_OP(enable_virtualization_cpu)
+KVM_X86_OP(disable_virtualization_cpu)
KVM_X86_OP(hardware_unsetup)
KVM_X86_OP(has_emulated_msr)
KVM_X86_OP(vcpu_after_set_cpuid)
@@ -85,7 +84,6 @@ KVM_X86_OP_OPTIONAL(update_cr8_intercept)
KVM_X86_OP(refresh_apicv_exec_ctrl)
KVM_X86_OP_OPTIONAL(hwapic_irr_update)
KVM_X86_OP_OPTIONAL(hwapic_isr_update)
-KVM_X86_OP_OPTIONAL_RET0(guest_apic_has_interrupt)
KVM_X86_OP_OPTIONAL(load_eoi_exitmap)
KVM_X86_OP_OPTIONAL(set_virtual_apic_mode)
KVM_X86_OP_OPTIONAL(set_apic_access_page_addr)
@@ -103,7 +101,6 @@ KVM_X86_OP(write_tsc_multiplier)
KVM_X86_OP(get_exit_info)
KVM_X86_OP(check_intercept)
KVM_X86_OP(handle_exit_irqoff)
-KVM_X86_OP(sched_in)
KVM_X86_OP_OPTIONAL(update_cpu_dirty_logging)
KVM_X86_OP_OPTIONAL(vcpu_blocking)
KVM_X86_OP_OPTIONAL(vcpu_unblocking)
@@ -121,13 +118,14 @@ KVM_X86_OP(enter_smm)
KVM_X86_OP(leave_smm)
KVM_X86_OP(enable_smi_window)
#endif
+KVM_X86_OP_OPTIONAL(dev_get_attr)
KVM_X86_OP_OPTIONAL(mem_enc_ioctl)
KVM_X86_OP_OPTIONAL(mem_enc_register_region)
KVM_X86_OP_OPTIONAL(mem_enc_unregister_region)
KVM_X86_OP_OPTIONAL(vm_copy_enc_context_from)
KVM_X86_OP_OPTIONAL(vm_move_enc_context_from)
KVM_X86_OP_OPTIONAL(guest_memory_reclaimed)
-KVM_X86_OP(get_msr_feature)
+KVM_X86_OP(get_feature_msr)
KVM_X86_OP(check_emulate_instruction)
KVM_X86_OP(apic_init_signal_blocked)
KVM_X86_OP_OPTIONAL(enable_l2_tlb_flush)
@@ -138,6 +136,9 @@ KVM_X86_OP(vcpu_deliver_sipi_vector)
KVM_X86_OP_OPTIONAL_RET0(vcpu_get_apicv_inhibit_reasons);
KVM_X86_OP_OPTIONAL(get_untagged_addr)
KVM_X86_OP_OPTIONAL(alloc_apic_backing_page)
+KVM_X86_OP_OPTIONAL_RET0(gmem_prepare)
+KVM_X86_OP_OPTIONAL_RET0(private_max_mapping_level)
+KVM_X86_OP_OPTIONAL(gmem_invalidate)
#undef KVM_X86_OP
#undef KVM_X86_OP_OPTIONAL
diff --git a/arch/x86/include/asm/kvm-x86-pmu-ops.h b/arch/x86/include/asm/kvm-x86-pmu-ops.h
index f852b13aeefe..9159bf1a4730 100644
--- a/arch/x86/include/asm/kvm-x86-pmu-ops.h
+++ b/arch/x86/include/asm/kvm-x86-pmu-ops.h
@@ -9,8 +9,7 @@ BUILD_BUG_ON(1)
* "static_call_update()" calls.
*
* KVM_X86_PMU_OP_OPTIONAL() can be used for those functions that can have
- * a NULL definition, for example if "static_call_cond()" will be used
- * at the call sites.
+ * a NULL definition.
*/
KVM_X86_PMU_OP(rdpmc_ecx_to_pmc)
KVM_X86_PMU_OP(msr_idx_to_pmc)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 6efd1497b026..6d9f763a7bb9 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -36,6 +36,7 @@
#include <asm/kvm_page_track.h>
#include <asm/kvm_vcpu_regs.h>
#include <asm/hyperv-tlfs.h>
+#include <asm/reboot.h>
#define __KVM_HAVE_ARCH_VCPU_DEBUGFS
@@ -121,6 +122,7 @@
KVM_ARCH_REQ_FLAGS(31, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
#define KVM_REQ_HV_TLB_FLUSH \
KVM_ARCH_REQ_FLAGS(32, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
+#define KVM_REQ_UPDATE_PROTECTED_GUEST_STATE KVM_ARCH_REQ(34)
#define CR0_RESERVED_BITS \
(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
@@ -159,7 +161,6 @@
#define KVM_MIN_FREE_MMU_PAGES 5
#define KVM_REFILL_PAGES 25
#define KVM_MAX_CPUID_ENTRIES 256
-#define KVM_NR_FIXED_MTRR_REGION 88
#define KVM_NR_VAR_MTRR 8
#define ASYNC_PF_PER_VCPU 64
@@ -211,6 +212,7 @@ enum exit_fastpath_completion {
EXIT_FASTPATH_NONE,
EXIT_FASTPATH_REENTER_GUEST,
EXIT_FASTPATH_EXIT_HANDLED,
+ EXIT_FASTPATH_EXIT_USERSPACE,
};
typedef enum exit_fastpath_completion fastpath_t;
@@ -254,32 +256,31 @@ enum x86_intercept_stage;
KVM_GUESTDBG_INJECT_DB | \
KVM_GUESTDBG_BLOCKIRQ)
+#define PFERR_PRESENT_MASK BIT(0)
+#define PFERR_WRITE_MASK BIT(1)
+#define PFERR_USER_MASK BIT(2)
+#define PFERR_RSVD_MASK BIT(3)
+#define PFERR_FETCH_MASK BIT(4)
+#define PFERR_PK_MASK BIT(5)
+#define PFERR_SGX_MASK BIT(15)
+#define PFERR_GUEST_RMP_MASK BIT_ULL(31)
+#define PFERR_GUEST_FINAL_MASK BIT_ULL(32)
+#define PFERR_GUEST_PAGE_MASK BIT_ULL(33)
+#define PFERR_GUEST_ENC_MASK BIT_ULL(34)
+#define PFERR_GUEST_SIZEM_MASK BIT_ULL(35)
+#define PFERR_GUEST_VMPL_MASK BIT_ULL(36)
-#define PFERR_PRESENT_BIT 0
-#define PFERR_WRITE_BIT 1
-#define PFERR_USER_BIT 2
-#define PFERR_RSVD_BIT 3
-#define PFERR_FETCH_BIT 4
-#define PFERR_PK_BIT 5
-#define PFERR_SGX_BIT 15
-#define PFERR_GUEST_FINAL_BIT 32
-#define PFERR_GUEST_PAGE_BIT 33
-#define PFERR_IMPLICIT_ACCESS_BIT 48
-
-#define PFERR_PRESENT_MASK BIT(PFERR_PRESENT_BIT)
-#define PFERR_WRITE_MASK BIT(PFERR_WRITE_BIT)
-#define PFERR_USER_MASK BIT(PFERR_USER_BIT)
-#define PFERR_RSVD_MASK BIT(PFERR_RSVD_BIT)
-#define PFERR_FETCH_MASK BIT(PFERR_FETCH_BIT)
-#define PFERR_PK_MASK BIT(PFERR_PK_BIT)
-#define PFERR_SGX_MASK BIT(PFERR_SGX_BIT)
-#define PFERR_GUEST_FINAL_MASK BIT_ULL(PFERR_GUEST_FINAL_BIT)
-#define PFERR_GUEST_PAGE_MASK BIT_ULL(PFERR_GUEST_PAGE_BIT)
-#define PFERR_IMPLICIT_ACCESS BIT_ULL(PFERR_IMPLICIT_ACCESS_BIT)
-
-#define PFERR_NESTED_GUEST_PAGE (PFERR_GUEST_PAGE_MASK | \
- PFERR_WRITE_MASK | \
- PFERR_PRESENT_MASK)
+/*
+ * IMPLICIT_ACCESS is a KVM-defined flag used to correctly perform SMAP checks
+ * when emulating instructions that triggers implicit access.
+ */
+#define PFERR_IMPLICIT_ACCESS BIT_ULL(48)
+/*
+ * PRIVATE_ACCESS is a KVM-defined flag us to indicate that a fault occurred
+ * when the guest was accessing private memory.
+ */
+#define PFERR_PRIVATE_ACCESS BIT_ULL(49)
+#define PFERR_SYNTHETIC_MASK (PFERR_IMPLICIT_ACCESS | PFERR_PRIVATE_ACCESS)
/* apic attention bits */
#define KVM_APIC_CHECK_VAPIC 0
@@ -530,12 +531,16 @@ struct kvm_pmc {
};
/* More counters may conflict with other existing Architectural MSRs */
-#define KVM_INTEL_PMC_MAX_GENERIC 8
-#define MSR_ARCH_PERFMON_PERFCTR_MAX (MSR_ARCH_PERFMON_PERFCTR0 + KVM_INTEL_PMC_MAX_GENERIC - 1)
-#define MSR_ARCH_PERFMON_EVENTSEL_MAX (MSR_ARCH_PERFMON_EVENTSEL0 + KVM_INTEL_PMC_MAX_GENERIC - 1)
-#define KVM_PMC_MAX_FIXED 3
-#define MSR_ARCH_PERFMON_FIXED_CTR_MAX (MSR_ARCH_PERFMON_FIXED_CTR0 + KVM_PMC_MAX_FIXED - 1)
-#define KVM_AMD_PMC_MAX_GENERIC 6
+#define KVM_MAX(a, b) ((a) >= (b) ? (a) : (b))
+#define KVM_MAX_NR_INTEL_GP_COUNTERS 8
+#define KVM_MAX_NR_AMD_GP_COUNTERS 6
+#define KVM_MAX_NR_GP_COUNTERS KVM_MAX(KVM_MAX_NR_INTEL_GP_COUNTERS, \
+ KVM_MAX_NR_AMD_GP_COUNTERS)
+
+#define KVM_MAX_NR_INTEL_FIXED_COUTNERS 3
+#define KVM_MAX_NR_AMD_FIXED_COUTNERS 0
+#define KVM_MAX_NR_FIXED_COUNTERS KVM_MAX(KVM_MAX_NR_INTEL_FIXED_COUTNERS, \
+ KVM_MAX_NR_AMD_FIXED_COUTNERS)
struct kvm_pmu {
u8 version;
@@ -543,16 +548,16 @@ struct kvm_pmu {
unsigned nr_arch_fixed_counters;
unsigned available_event_types;
u64 fixed_ctr_ctrl;
- u64 fixed_ctr_ctrl_mask;
+ u64 fixed_ctr_ctrl_rsvd;
u64 global_ctrl;
u64 global_status;
u64 counter_bitmask[2];
- u64 global_ctrl_mask;
- u64 global_status_mask;
+ u64 global_ctrl_rsvd;
+ u64 global_status_rsvd;
u64 reserved_bits;
u64 raw_event_mask;
- struct kvm_pmc gp_counters[KVM_INTEL_PMC_MAX_GENERIC];
- struct kvm_pmc fixed_counters[KVM_PMC_MAX_FIXED];
+ struct kvm_pmc gp_counters[KVM_MAX_NR_GP_COUNTERS];
+ struct kvm_pmc fixed_counters[KVM_MAX_NR_FIXED_COUNTERS];
/*
* Overlay the bitmap with a 64-bit atomic so that all bits can be
@@ -568,9 +573,9 @@ struct kvm_pmu {
u64 ds_area;
u64 pebs_enable;
- u64 pebs_enable_mask;
+ u64 pebs_enable_rsvd;
u64 pebs_data_cfg;
- u64 pebs_data_cfg_mask;
+ u64 pebs_data_cfg_rsvd;
/*
* If a guest counter is cross-mapped to host counter with different
@@ -601,18 +606,12 @@ enum {
KVM_DEBUGREG_WONT_EXIT = 2,
};
-struct kvm_mtrr_range {
- u64 base;
- u64 mask;
- struct list_head node;
-};
-
struct kvm_mtrr {
- struct kvm_mtrr_range var_ranges[KVM_NR_VAR_MTRR];
- mtrr_type fixed_ranges[KVM_NR_FIXED_MTRR_REGION];
+ u64 var[KVM_NR_VAR_MTRR * 2];
+ u64 fixed_64k;
+ u64 fixed_16k[2];
+ u64 fixed_4k[8];
u64 deftype;
-
- struct list_head head;
};
/* Hyper-V SynIC timer */
@@ -994,9 +993,6 @@ struct kvm_vcpu_arch {
u64 msr_kvm_poll_control;
- /* set at EPT violation at this point */
- unsigned long exit_qualification;
-
/* pv related host specific info */
struct {
bool pv_unhalted;
@@ -1207,7 +1203,7 @@ enum kvm_apicv_inhibit {
* APIC acceleration is disabled by a module parameter
* and/or not supported in hardware.
*/
- APICV_INHIBIT_REASON_DISABLE,
+ APICV_INHIBIT_REASON_DISABLED,
/*
* APIC acceleration is inhibited because AutoEOI feature is
@@ -1277,15 +1273,37 @@ enum kvm_apicv_inhibit {
* mapping between logical ID and vCPU.
*/
APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED,
+
+ NR_APICV_INHIBIT_REASONS,
};
+#define __APICV_INHIBIT_REASON(reason) \
+ { BIT(APICV_INHIBIT_REASON_##reason), #reason }
+
+#define APICV_INHIBIT_REASONS \
+ __APICV_INHIBIT_REASON(DISABLED), \
+ __APICV_INHIBIT_REASON(HYPERV), \
+ __APICV_INHIBIT_REASON(ABSENT), \
+ __APICV_INHIBIT_REASON(BLOCKIRQ), \
+ __APICV_INHIBIT_REASON(PHYSICAL_ID_ALIASED), \
+ __APICV_INHIBIT_REASON(APIC_ID_MODIFIED), \
+ __APICV_INHIBIT_REASON(APIC_BASE_MODIFIED), \
+ __APICV_INHIBIT_REASON(NESTED), \
+ __APICV_INHIBIT_REASON(IRQWIN), \
+ __APICV_INHIBIT_REASON(PIT_REINJ), \
+ __APICV_INHIBIT_REASON(SEV), \
+ __APICV_INHIBIT_REASON(LOGICAL_ID_ALIASED)
+
struct kvm_arch {
- unsigned long vm_type;
unsigned long n_used_mmu_pages;
unsigned long n_requested_mmu_pages;
unsigned long n_max_mmu_pages;
unsigned int indirect_shadow_pages;
u8 mmu_valid_gen;
+ u8 vm_type;
+ bool has_private_mem;
+ bool has_protected_state;
+ bool pre_fault_allowed;
struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
struct list_head active_mmu_pages;
struct list_head zapped_obsolete_pages;
@@ -1312,6 +1330,8 @@ struct kvm_arch {
*/
spinlock_t mmu_unsync_pages_lock;
+ u64 shadow_mmio_value;
+
struct iommu_domain *iommu_domain;
bool iommu_noncoherent;
#define __KVM_HAVE_ARCH_NONCOHERENT_DMA
@@ -1360,6 +1380,7 @@ struct kvm_arch {
u32 default_tsc_khz;
bool user_set_tsc;
+ u64 apic_bus_cycle_ns;
seqcount_raw_spinlock_t pvclock_sc;
bool use_master_clock;
@@ -1606,8 +1627,10 @@ struct kvm_x86_ops {
int (*check_processor_compatibility)(void);
- int (*hardware_enable)(void);
- void (*hardware_disable)(void);
+ int (*enable_virtualization_cpu)(void);
+ void (*disable_virtualization_cpu)(void);
+ cpu_emergency_virt_cb *emergency_disable_virtualization_cpu;
+
void (*hardware_unsetup)(void);
bool (*has_emulated_msr)(struct kvm *kvm, u32 index);
void (*vcpu_after_set_cpuid)(struct kvm_vcpu *vcpu);
@@ -1704,13 +1727,13 @@ struct kvm_x86_ops {
void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
void (*enable_irq_window)(struct kvm_vcpu *vcpu);
void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
- bool (*check_apicv_inhibit_reasons)(enum kvm_apicv_inhibit reason);
+
+ const bool x2apic_icr_is_split;
const unsigned long required_apicv_inhibits;
bool allow_apicv_in_x2apic_without_x2apic_virtualization;
void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu);
void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr);
void (*hwapic_isr_update)(int isr);
- bool (*guest_apic_has_interrupt)(struct kvm_vcpu *vcpu);
void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
void (*set_virtual_apic_mode)(struct kvm_vcpu *vcpu);
void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu);
@@ -1745,8 +1768,6 @@ struct kvm_x86_ops {
struct x86_exception *exception);
void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu);
- void (*sched_in)(struct kvm_vcpu *vcpu, int cpu);
-
/*
* Size of the CPU's dirty log buffer, i.e. VMX's PML buffer. A zero
* value indicates CPU dirty logging is unsupported or disabled.
@@ -1779,6 +1800,7 @@ struct kvm_x86_ops {
void (*enable_smi_window)(struct kvm_vcpu *vcpu);
#endif
+ int (*dev_get_attr)(u32 group, u64 attr, u64 *val);
int (*mem_enc_ioctl)(struct kvm *kvm, void __user *argp);
int (*mem_enc_register_region)(struct kvm *kvm, struct kvm_enc_region *argp);
int (*mem_enc_unregister_region)(struct kvm *kvm, struct kvm_enc_region *argp);
@@ -1786,7 +1808,7 @@ struct kvm_x86_ops {
int (*vm_move_enc_context_from)(struct kvm *kvm, unsigned int source_fd);
void (*guest_memory_reclaimed)(struct kvm *kvm);
- int (*get_msr_feature)(struct kvm_msr_entry *entry);
+ int (*get_feature_msr)(u32 msr, u64 *data);
int (*check_emulate_instruction)(struct kvm_vcpu *vcpu, int emul_type,
void *insn, int insn_len);
@@ -1807,6 +1829,9 @@ struct kvm_x86_ops {
gva_t (*get_untagged_addr)(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags);
void *(*alloc_apic_backing_page)(struct kvm_vcpu *vcpu);
+ int (*gmem_prepare)(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order);
+ void (*gmem_invalidate)(kvm_pfn_t start, kvm_pfn_t end);
+ int (*private_max_mapping_level)(struct kvm *kvm, kvm_pfn_t pfn);
};
struct kvm_x86_nested_ops {
@@ -1814,7 +1839,7 @@ struct kvm_x86_nested_ops {
bool (*is_exception_vmexit)(struct kvm_vcpu *vcpu, u8 vector,
u32 error_code);
int (*check_events)(struct kvm_vcpu *vcpu);
- bool (*has_events)(struct kvm_vcpu *vcpu);
+ bool (*has_events)(struct kvm_vcpu *vcpu, bool for_injection);
void (*triple_fault)(struct kvm_vcpu *vcpu);
int (*get_state)(struct kvm_vcpu *vcpu,
struct kvm_nested_state __user *user_kvm_nested_state,
@@ -1844,14 +1869,17 @@ struct kvm_arch_async_pf {
gfn_t gfn;
unsigned long cr3;
bool direct_map;
+ u64 error_code;
};
extern u32 __read_mostly kvm_nr_uret_msrs;
-extern u64 __read_mostly host_efer;
extern bool __read_mostly allow_smaller_maxphyaddr;
extern bool __read_mostly enable_apicv;
extern struct kvm_x86_ops kvm_x86_ops;
+#define kvm_x86_call(func) static_call(kvm_x86_##func)
+#define kvm_pmu_call(func) static_call(kvm_x86_pmu_##func)
+
#define KVM_X86_OP(func) \
DECLARE_STATIC_CALL(kvm_x86_##func, *(((struct kvm_x86_ops *)0)->func));
#define KVM_X86_OP_OPTIONAL KVM_X86_OP
@@ -1875,7 +1903,7 @@ void kvm_arch_free_vm(struct kvm *kvm);
static inline int kvm_arch_flush_remote_tlbs(struct kvm *kvm)
{
if (kvm_x86_ops.flush_remote_tlbs &&
- !static_call(kvm_x86_flush_remote_tlbs)(kvm))
+ !kvm_x86_call(flush_remote_tlbs)(kvm))
return 0;
else
return -ENOTSUPP;
@@ -1888,7 +1916,7 @@ static inline int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn,
if (!kvm_x86_ops.flush_remote_tlbs_range)
return -EOPNOTSUPP;
- return static_call(kvm_x86_flush_remote_tlbs_range)(kvm, gfn, nr_pages);
+ return kvm_x86_call(flush_remote_tlbs_range)(kvm, gfn, nr_pages);
}
#endif /* CONFIG_HYPERV */
@@ -1933,6 +1961,7 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
const struct kvm_memory_slot *memslot);
void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen);
void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long kvm_nr_mmu_pages);
+void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end);
int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
@@ -2033,6 +2062,8 @@ void kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu);
void kvm_enable_efer_bits(u64);
bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer);
+int kvm_get_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 *data);
+int kvm_set_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 data);
int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, bool host_initiated);
int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data);
int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data);
@@ -2109,7 +2140,15 @@ int kvm_get_nr_pending_nmis(struct kvm_vcpu *vcpu);
void kvm_update_dr7(struct kvm_vcpu *vcpu);
-int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn);
+bool __kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
+ bool always_retry);
+
+static inline bool kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu,
+ gpa_t cr2_or_gpa)
+{
+ return __kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa, false);
+}
+
void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
ulong roots_to_free);
void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu);
@@ -2140,10 +2179,15 @@ static inline void kvm_clear_apicv_inhibit(struct kvm *kvm,
kvm_set_or_clear_apicv_inhibit(kvm, reason, false);
}
+unsigned long __kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr,
+ unsigned long a0, unsigned long a1,
+ unsigned long a2, unsigned long a3,
+ int op_64_bit, int cpl);
int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
void *insn, int insn_len);
+void kvm_mmu_print_sptes(struct kvm_vcpu *vcpu, gpa_t gpa, const char *msg);
void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
void kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
u64 addr, unsigned long roots);
@@ -2153,12 +2197,15 @@ void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd);
void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
int tdp_max_root_level, int tdp_huge_page_level);
+
#ifdef CONFIG_KVM_PRIVATE_MEM
-#define kvm_arch_has_private_mem(kvm) ((kvm)->arch.vm_type != KVM_X86_DEFAULT_VM)
+#define kvm_arch_has_private_mem(kvm) ((kvm)->arch.has_private_mem)
#else
#define kvm_arch_has_private_mem(kvm) false
#endif
+#define kvm_arch_has_readonly_mem(kvm) (!(kvm)->arch.has_protected_state)
+
static inline u16 kvm_read_ldt(void)
{
u16 ldt;
@@ -2219,6 +2266,7 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v);
int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
int kvm_cpu_has_extint(struct kvm_vcpu *v);
int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
+int kvm_cpu_get_extint(struct kvm_vcpu *v);
int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
@@ -2280,12 +2328,12 @@ static inline bool kvm_irq_is_postable(struct kvm_lapic_irq *irq)
static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
{
- static_call_cond(kvm_x86_vcpu_blocking)(vcpu);
+ kvm_x86_call(vcpu_blocking)(vcpu);
}
static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu)
{
- static_call_cond(kvm_x86_vcpu_unblocking)(vcpu);
+ kvm_x86_call(vcpu_unblocking)(vcpu);
}
static inline int kvm_cpu_get_apicid(int mps_cpu)
@@ -2310,7 +2358,8 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages);
KVM_X86_QUIRK_OUT_7E_INC_RIP | \
KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT | \
KVM_X86_QUIRK_FIX_HYPERCALL_INSN | \
- KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS)
+ KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS | \
+ KVM_X86_QUIRK_SLOT_ZAP_ALL)
/*
* KVM previously used a u32 field in kvm_run to indicate the hypercall was
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index de3118305838..3b9970117a0f 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -13,6 +13,7 @@
#define MCG_CTL_P BIT_ULL(8) /* MCG_CTL register available */
#define MCG_EXT_P BIT_ULL(9) /* Extended registers available */
#define MCG_CMCI_P BIT_ULL(10) /* CMCI supported */
+#define MCG_SEAM_NR BIT_ULL(12) /* MCG_STATUS_SEAM_NR supported */
#define MCG_EXT_CNT_MASK 0xff0000 /* Number of Extended registers */
#define MCG_EXT_CNT_SHIFT 16
#define MCG_EXT_CNT(c) (((c) & MCG_EXT_CNT_MASK) >> MCG_EXT_CNT_SHIFT)
@@ -25,6 +26,7 @@
#define MCG_STATUS_EIPV BIT_ULL(1) /* ip points to correct instruction */
#define MCG_STATUS_MCIP BIT_ULL(2) /* machine check in progress */
#define MCG_STATUS_LMCES BIT_ULL(3) /* LMCE signaled */
+#define MCG_STATUS_SEAM_NR BIT_ULL(12) /* Machine check inside SEAM non-root mode */
/* MCG_EXT_CTL register defines */
#define MCG_EXT_CTL_LMCE_EN BIT_ULL(0) /* Enable LMCE */
@@ -219,7 +221,7 @@ static inline int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info,
u64 lapic_id) { return -EINVAL; }
#endif
-void mce_setup(struct mce *m);
+void mce_prep_record(struct mce *m);
void mce_log(struct mce *m);
DECLARE_PER_CPU(struct device *, mce_device);
@@ -259,7 +261,8 @@ enum mcp_flags {
MCP_DONTLOG = BIT(2), /* only clear, don't log */
MCP_QUEUE_LOG = BIT(3), /* only queue to genpool */
};
-bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
+
+void machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
int mce_notify_irq(void);
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 8dac45a2c7fc..2886cb668d7f 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -88,7 +88,13 @@ static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next)
#ifdef CONFIG_ADDRESS_MASKING
static inline unsigned long mm_lam_cr3_mask(struct mm_struct *mm)
{
- return mm->context.lam_cr3_mask;
+ /*
+ * When switch_mm_irqs_off() is called for a kthread, it may race with
+ * LAM enablement. switch_mm_irqs_off() uses the LAM mask to do two
+ * things: populate CR3 and populate 'cpu_tlbstate.lam'. Make sure it
+ * reads a single value for both.
+ */
+ return READ_ONCE(mm->context.lam_cr3_mask);
}
static inline void dup_lam(struct mm_struct *oldmm, struct mm_struct *mm)
@@ -232,11 +238,6 @@ static inline bool is_64bit_mm(struct mm_struct *mm)
}
#endif
-static inline void arch_unmap(struct mm_struct *mm, unsigned long start,
- unsigned long end)
-{
-}
-
/*
* We only want to enforce protection keys on the current process
* because we effectively have no access to PKRU for other
diff --git a/arch/x86/include/asm/mmzone.h b/arch/x86/include/asm/mmzone.h
deleted file mode 100644
index c41b41edd691..000000000000
--- a/arch/x86/include/asm/mmzone.h
+++ /dev/null
@@ -1,6 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifdef CONFIG_X86_32
-# include <asm/mmzone_32.h>
-#else
-# include <asm/mmzone_64.h>
-#endif
diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h
deleted file mode 100644
index 2d4515e8b7df..000000000000
--- a/arch/x86/include/asm/mmzone_32.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Written by Pat Gaughen (gone@us.ibm.com) Mar 2002
- *
- */
-
-#ifndef _ASM_X86_MMZONE_32_H
-#define _ASM_X86_MMZONE_32_H
-
-#include <asm/smp.h>
-
-#ifdef CONFIG_NUMA
-extern struct pglist_data *node_data[];
-#define NODE_DATA(nid) (node_data[nid])
-#endif /* CONFIG_NUMA */
-
-#endif /* _ASM_X86_MMZONE_32_H */
diff --git a/arch/x86/include/asm/mmzone_64.h b/arch/x86/include/asm/mmzone_64.h
deleted file mode 100644
index 0c585046f744..000000000000
--- a/arch/x86/include/asm/mmzone_64.h
+++ /dev/null
@@ -1,18 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/* K8 NUMA support */
-/* Copyright 2002,2003 by Andi Kleen, SuSE Labs */
-/* 2.5 Version loosely based on the NUMAQ Code by Pat Gaughen. */
-#ifndef _ASM_X86_MMZONE_64_H
-#define _ASM_X86_MMZONE_64_H
-
-#ifdef CONFIG_NUMA
-
-#include <linux/mmdebug.h>
-#include <asm/smp.h>
-
-extern struct pglist_data *node_data[];
-
-#define NODE_DATA(nid) (node_data[nid])
-
-#endif
-#endif /* _ASM_X86_MMZONE_64_H */
diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index c72c7ff78fcd..d593e52e6635 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -16,10 +16,10 @@ extern int pic_mode;
* Summit or generic (i.e. installer) kernels need lots of bus entries.
* Maximum 256 PCI busses, plus 1 ISA bus in each of 4 cabinets.
*/
-#if CONFIG_BASE_SMALL == 0
-# define MAX_MP_BUSSES 260
-#else
+#ifdef CONFIG_BASE_SMALL
# define MAX_MP_BUSSES 32
+#else
+# define MAX_MP_BUSSES 260
#endif
#define MAX_IRQ_SOURCES 256
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index 390c4d13956d..5f0bc6a6d025 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -40,7 +40,6 @@ static inline unsigned char hv_get_nmi_reason(void)
}
#if IS_ENABLED(CONFIG_HYPERV)
-extern int hyperv_init_cpuhp;
extern bool hyperv_paravisor_present;
extern void *hv_hypercall_pg;
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index e72c2b872957..3ae84c3b8e6d 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -36,6 +36,20 @@
#define EFER_FFXSR (1<<_EFER_FFXSR)
#define EFER_AUTOIBRS (1<<_EFER_AUTOIBRS)
+/*
+ * Architectural memory types that are common to MTRRs, PAT, VMX MSRs, etc.
+ * Most MSRs support/allow only a subset of memory types, but the values
+ * themselves are common across all relevant MSRs.
+ */
+#define X86_MEMTYPE_UC 0ull /* Uncacheable, a.k.a. Strong Uncacheable */
+#define X86_MEMTYPE_WC 1ull /* Write Combining */
+/* RESERVED 2 */
+/* RESERVED 3 */
+#define X86_MEMTYPE_WT 4ull /* Write Through */
+#define X86_MEMTYPE_WP 5ull /* Write Protected */
+#define X86_MEMTYPE_WB 6ull /* Write Back */
+#define X86_MEMTYPE_UC_MINUS 7ull /* Weak Uncacheabled (PAT only) */
+
/* FRED MSRs */
#define MSR_IA32_FRED_RSP0 0x1cc /* Level 0 stack pointer */
#define MSR_IA32_FRED_RSP1 0x1cd /* Level 1 stack pointer */
@@ -170,6 +184,10 @@
* CPU is not affected by Branch
* History Injection.
*/
+#define ARCH_CAP_XAPIC_DISABLE BIT(21) /*
+ * IA32_XAPIC_DISABLE_STATUS MSR
+ * supported
+ */
#define ARCH_CAP_PBRSB_NO BIT(24) /*
* Not susceptible to Post-Barrier
* Return Stack Buffer Predictions.
@@ -192,11 +210,6 @@
* File.
*/
-#define ARCH_CAP_XAPIC_DISABLE BIT(21) /*
- * IA32_XAPIC_DISABLE_STATUS MSR
- * supported
- */
-
#define MSR_IA32_FLUSH_CMD 0x0000010b
#define L1D_FLUSH BIT(0) /*
* Writeback and invalidate the
@@ -248,6 +261,8 @@
#define MSR_INTEGRITY_CAPS_ARRAY_BIST BIT(MSR_INTEGRITY_CAPS_ARRAY_BIST_BIT)
#define MSR_INTEGRITY_CAPS_PERIODIC_BIST_BIT 4
#define MSR_INTEGRITY_CAPS_PERIODIC_BIST BIT(MSR_INTEGRITY_CAPS_PERIODIC_BIST_BIT)
+#define MSR_INTEGRITY_CAPS_SBAF_BIT 8
+#define MSR_INTEGRITY_CAPS_SBAF BIT(MSR_INTEGRITY_CAPS_SBAF_BIT)
#define MSR_INTEGRITY_CAPS_SAF_GEN_MASK GENMASK_ULL(10, 9)
#define MSR_LBR_NHM_FROM 0x00000680
@@ -364,6 +379,12 @@
#define MSR_IA32_CR_PAT 0x00000277
+#define PAT_VALUE(p0, p1, p2, p3, p4, p5, p6, p7) \
+ ((X86_MEMTYPE_ ## p0) | (X86_MEMTYPE_ ## p1 << 8) | \
+ (X86_MEMTYPE_ ## p2 << 16) | (X86_MEMTYPE_ ## p3 << 24) | \
+ (X86_MEMTYPE_ ## p4 << 32) | (X86_MEMTYPE_ ## p5 << 40) | \
+ (X86_MEMTYPE_ ## p6 << 48) | (X86_MEMTYPE_ ## p7 << 56))
+
#define MSR_IA32_DEBUGCTLMSR 0x000001d9
#define MSR_IA32_LASTBRANCHFROMIP 0x000001db
#define MSR_IA32_LASTBRANCHTOIP 0x000001dc
@@ -567,6 +588,12 @@
#define MSR_RELOAD_PMC0 0x000014c1
#define MSR_RELOAD_FIXED_CTR0 0x00001309
+/* V6 PMON MSR range */
+#define MSR_IA32_PMC_V6_GP0_CTR 0x1900
+#define MSR_IA32_PMC_V6_GP0_CFG_A 0x1901
+#define MSR_IA32_PMC_V6_FX0_CTR 0x1980
+#define MSR_IA32_PMC_V6_STEP 4
+
/* KeyID partitioning between MKTME and TDX */
#define MSR_IA32_MKTME_KEYID_PARTITIONING 0x00000087
@@ -661,6 +688,8 @@
#define MSR_AMD64_RMP_BASE 0xc0010132
#define MSR_AMD64_RMP_END 0xc0010133
+#define MSR_SVSM_CAA 0xc001f000
+
/* AMD Collaborative Processor Performance Control MSRs */
#define MSR_AMD_CPPC_CAP1 0xc00102b0
#define MSR_AMD_CPPC_ENABLE 0xc00102b1
@@ -782,6 +811,8 @@
#define MSR_K7_HWCR_IRPERF_EN BIT_ULL(MSR_K7_HWCR_IRPERF_EN_BIT)
#define MSR_K7_FID_VID_CTL 0xc0010041
#define MSR_K7_FID_VID_STATUS 0xc0010042
+#define MSR_K7_HWCR_CPB_DIS_BIT 25
+#define MSR_K7_HWCR_CPB_DIS BIT_ULL(MSR_K7_HWCR_CPB_DIS_BIT)
/* K6 MSRs */
#define MSR_K6_WHCR 0xc0000082
@@ -1148,15 +1179,6 @@
#define MSR_IA32_VMX_VMFUNC 0x00000491
#define MSR_IA32_VMX_PROCBASED_CTLS3 0x00000492
-/* VMX_BASIC bits and bitmasks */
-#define VMX_BASIC_VMCS_SIZE_SHIFT 32
-#define VMX_BASIC_TRUE_CTLS (1ULL << 55)
-#define VMX_BASIC_64 0x0001000000000000LLU
-#define VMX_BASIC_MEM_TYPE_SHIFT 50
-#define VMX_BASIC_MEM_TYPE_MASK 0x003c000000000000LLU
-#define VMX_BASIC_MEM_TYPE_WB 6LLU
-#define VMX_BASIC_INOUT 0x0040000000000000LLU
-
/* Resctrl MSRs: */
/* - Intel: */
#define MSR_IA32_L3_QOS_CFG 0xc81
@@ -1165,6 +1187,7 @@
#define MSR_IA32_QM_CTR 0xc8e
#define MSR_IA32_PQR_ASSOC 0xc8f
#define MSR_IA32_L3_CBM_BASE 0xc90
+#define MSR_RMID_SNC_CONFIG 0xca0
#define MSR_IA32_L2_CBM_BASE 0xd10
#define MSR_IA32_MBA_THRTL_BASE 0xd50
@@ -1173,11 +1196,6 @@
#define MSR_IA32_SMBA_BW_BASE 0xc0000280
#define MSR_IA32_EVT_CFG_BASE 0xc0000400
-/* MSR_IA32_VMX_MISC bits */
-#define MSR_IA32_VMX_MISC_INTEL_PT (1ULL << 14)
-#define MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS (1ULL << 29)
-#define MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE 0x1F
-
/* AMD-V MSRs */
#define MSR_VM_CR 0xc0010114
#define MSR_VM_IGNNE 0xc0010115
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index d642037f9ed5..001853541f1e 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -99,19 +99,6 @@ static __always_inline void __wrmsr(unsigned int msr, u32 low, u32 high)
: : "c" (msr), "a"(low), "d" (high) : "memory");
}
-/*
- * WRMSRNS behaves exactly like WRMSR with the only difference being
- * that it is not a serializing instruction by default.
- */
-static __always_inline void __wrmsrns(u32 msr, u32 low, u32 high)
-{
- /* Instruction opcode for WRMSRNS; supported in binutils >= 2.40. */
- asm volatile("1: .byte 0x0f,0x01,0xc6\n"
- "2:\n"
- _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR)
- : : "c" (msr), "a"(low), "d" (high));
-}
-
#define native_rdmsr(msr, val1, val2) \
do { \
u64 __val = __rdmsr((msr)); \
@@ -312,9 +299,19 @@ do { \
#endif /* !CONFIG_PARAVIRT_XXL */
+/* Instruction opcode for WRMSRNS supported in binutils >= 2.40 */
+#define WRMSRNS _ASM_BYTES(0x0f,0x01,0xc6)
+
+/* Non-serializing WRMSR, when available. Falls back to a serializing WRMSR. */
static __always_inline void wrmsrns(u32 msr, u64 val)
{
- __wrmsrns(msr, val, val >> 32);
+ /*
+ * WRMSR is 2 bytes. WRMSRNS is 3 bytes. Pad WRMSR with a redundant
+ * DS prefix to avoid a trailing NOP.
+ */
+ asm volatile("1: " ALTERNATIVE("ds wrmsr", WRMSRNS, X86_FEATURE_WRMSRNS)
+ "2: " _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR)
+ : : "c" (msr), "a" ((u32)val), "d" ((u32)(val >> 32)));
}
/*
diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h
index 090d658a85a6..4218248083d9 100644
--- a/arch/x86/include/asm/mtrr.h
+++ b/arch/x86/include/asm/mtrr.h
@@ -69,7 +69,6 @@ extern int mtrr_add_page(unsigned long base, unsigned long size,
unsigned int type, bool increment);
extern int mtrr_del(int reg, unsigned long base, unsigned long size);
extern int mtrr_del_page(int reg, unsigned long base, unsigned long size);
-extern void mtrr_bp_restore(void);
extern int mtrr_trim_uncached_memory(unsigned long end_pfn);
extern int amd_special_default_mtrr(void);
void mtrr_disable(void);
@@ -117,7 +116,6 @@ static inline int mtrr_trim_uncached_memory(unsigned long end_pfn)
return 0;
}
#define mtrr_bp_init() do {} while (0)
-#define mtrr_bp_restore() do {} while (0)
#define mtrr_disable() do {} while (0)
#define mtrr_enable() do {} while (0)
#define mtrr_generic_set_state() do {} while (0)
diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h
index ef2844d69173..5469d7a7c40f 100644
--- a/arch/x86/include/asm/numa.h
+++ b/arch/x86/include/asm/numa.h
@@ -10,8 +10,6 @@
#ifdef CONFIG_NUMA
-#define NR_NODE_MEMBLKS (MAX_NUMNODES*2)
-
extern int numa_off;
/*
@@ -25,9 +23,6 @@ extern int numa_off;
extern s16 __apicid_to_node[MAX_LOCAL_APIC];
extern nodemask_t numa_nodes_parsed __initdata;
-extern int __init numa_add_memblk(int nodeid, u64 start, u64 end);
-extern void __init numa_set_distance(int from, int to, int distance);
-
static inline void set_apicid_to_node(int apicid, s16 node)
{
__apicid_to_node[apicid] = node;
@@ -54,31 +49,20 @@ static inline int numa_cpu_node(int cpu)
extern void numa_set_node(int cpu, int node);
extern void numa_clear_node(int cpu);
extern void __init init_cpu_to_node(void);
-extern void numa_add_cpu(int cpu);
-extern void numa_remove_cpu(int cpu);
+extern void numa_add_cpu(unsigned int cpu);
+extern void numa_remove_cpu(unsigned int cpu);
extern void init_gi_nodes(void);
#else /* CONFIG_NUMA */
static inline void numa_set_node(int cpu, int node) { }
static inline void numa_clear_node(int cpu) { }
static inline void init_cpu_to_node(void) { }
-static inline void numa_add_cpu(int cpu) { }
-static inline void numa_remove_cpu(int cpu) { }
+static inline void numa_add_cpu(unsigned int cpu) { }
+static inline void numa_remove_cpu(unsigned int cpu) { }
static inline void init_gi_nodes(void) { }
#endif /* CONFIG_NUMA */
#ifdef CONFIG_DEBUG_PER_CPU_MAPS
-void debug_cpumask_set_cpu(int cpu, int node, bool enable);
+void debug_cpumask_set_cpu(unsigned int cpu, int node, bool enable);
#endif
-#ifdef CONFIG_NUMA_EMU
-#define FAKE_NODE_MIN_SIZE ((u64)32 << 20)
-#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL))
-int numa_emu_cmdline(char *str);
-#else /* CONFIG_NUMA_EMU */
-static inline int numa_emu_cmdline(char *str)
-{
- return -EINVAL;
-}
-#endif /* CONFIG_NUMA_EMU */
-
#endif /* _ASM_X86_NUMA_H */
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index cc6b8e087192..f3d257c45225 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -17,6 +17,7 @@ extern unsigned long phys_base;
extern unsigned long page_offset_base;
extern unsigned long vmalloc_base;
extern unsigned long vmemmap_base;
+extern unsigned long physmem_end;
static __always_inline unsigned long __phys_addr_nodebug(unsigned long x)
{
@@ -54,7 +55,7 @@ static inline void clear_page(void *page)
clear_page_rep, X86_FEATURE_REP_GOOD,
clear_page_erms, X86_FEATURE_ERMS,
"=D" (page),
- "0" (page)
+ "D" (page)
: "cc", "memory", "rax", "rcx");
}
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index 9da9c8a2f1df..52f1b4ff0cc1 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -31,10 +31,12 @@
#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_TSK_EXEC
-#define __PHYSICAL_START ALIGN(CONFIG_PHYSICAL_START, \
- CONFIG_PHYSICAL_ALIGN)
+/* Physical address where kernel should be loaded. */
+#define LOAD_PHYSICAL_ADDR ((CONFIG_PHYSICAL_START \
+ + (CONFIG_PHYSICAL_ALIGN - 1)) \
+ & ~(CONFIG_PHYSICAL_ALIGN - 1))
-#define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START)
+#define __START_KERNEL (__START_KERNEL_map + LOAD_PHYSICAL_ADDR)
#ifdef CONFIG_X86_64
#include <asm/page_64_types.h>
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 44958ebaf626..c55a79d5feae 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -3,30 +3,30 @@
#define _ASM_X86_PERCPU_H
#ifdef CONFIG_X86_64
-#define __percpu_seg gs
-#define __percpu_rel (%rip)
+# define __percpu_seg gs
+# define __percpu_rel (%rip)
#else
-#define __percpu_seg fs
-#define __percpu_rel
+# define __percpu_seg fs
+# define __percpu_rel
#endif
#ifdef __ASSEMBLY__
#ifdef CONFIG_SMP
-#define __percpu %__percpu_seg:
+# define __percpu %__percpu_seg:
#else
-#define __percpu
+# define __percpu
#endif
#define PER_CPU_VAR(var) __percpu(var)__percpu_rel
#ifdef CONFIG_X86_64_SMP
-#define INIT_PER_CPU_VAR(var) init_per_cpu__##var
+# define INIT_PER_CPU_VAR(var) init_per_cpu__##var
#else
-#define INIT_PER_CPU_VAR(var) var
+# define INIT_PER_CPU_VAR(var) var
#endif
-#else /* ...!ASSEMBLY */
+#else /* !__ASSEMBLY__: */
#include <linux/build_bug.h>
#include <linux/stringify.h>
@@ -37,19 +37,19 @@
#ifdef CONFIG_CC_HAS_NAMED_AS
#ifdef __CHECKER__
-#define __seg_gs __attribute__((address_space(__seg_gs)))
-#define __seg_fs __attribute__((address_space(__seg_fs)))
+# define __seg_gs __attribute__((address_space(__seg_gs)))
+# define __seg_fs __attribute__((address_space(__seg_fs)))
#endif
#ifdef CONFIG_X86_64
-#define __percpu_seg_override __seg_gs
+# define __percpu_seg_override __seg_gs
#else
-#define __percpu_seg_override __seg_fs
+# define __percpu_seg_override __seg_fs
#endif
#define __percpu_prefix ""
-#else /* CONFIG_CC_HAS_NAMED_AS */
+#else /* !CONFIG_CC_HAS_NAMED_AS: */
#define __percpu_seg_override
#define __percpu_prefix "%%"__stringify(__percpu_seg)":"
@@ -59,40 +59,30 @@
#define __force_percpu_prefix "%%"__stringify(__percpu_seg)":"
#define __my_cpu_offset this_cpu_read(this_cpu_off)
-#ifdef CONFIG_USE_X86_SEG_SUPPORT
-/*
- * Efficient implementation for cases in which the compiler supports
- * named address spaces. Allows the compiler to perform additional
- * optimizations that can save more instructions.
- */
-#define arch_raw_cpu_ptr(ptr) \
-({ \
- unsigned long tcp_ptr__; \
- tcp_ptr__ = __raw_cpu_read(, this_cpu_off); \
- \
- tcp_ptr__ += (unsigned long)(ptr); \
- (typeof(*(ptr)) __kernel __force *)tcp_ptr__; \
-})
-#else /* CONFIG_USE_X86_SEG_SUPPORT */
/*
* Compared to the generic __my_cpu_offset version, the following
* saves one instruction and avoids clobbering a temp register.
+ *
+ * arch_raw_cpu_ptr should not be used in 32-bit VDSO for a 64-bit
+ * kernel, because games are played with CONFIG_X86_64 there and
+ * sizeof(this_cpu_off) becames 4.
*/
-#define arch_raw_cpu_ptr(ptr) \
-({ \
- unsigned long tcp_ptr__; \
- asm ("mov " __percpu_arg(1) ", %0" \
- : "=r" (tcp_ptr__) \
- : "m" (__my_cpu_var(this_cpu_off))); \
- \
- tcp_ptr__ += (unsigned long)(ptr); \
- (typeof(*(ptr)) __kernel __force *)tcp_ptr__; \
+#ifndef BUILD_VDSO32_64
+#define arch_raw_cpu_ptr(_ptr) \
+({ \
+ unsigned long tcp_ptr__ = raw_cpu_read_long(this_cpu_off); \
+ \
+ tcp_ptr__ += (__force unsigned long)(_ptr); \
+ (typeof(*(_ptr)) __kernel __force *)tcp_ptr__; \
})
-#endif /* CONFIG_USE_X86_SEG_SUPPORT */
+#else
+#define arch_raw_cpu_ptr(_ptr) ({ BUILD_BUG(); (typeof(_ptr))0; })
+#endif
#define PER_CPU_VAR(var) %__percpu_seg:(var)__percpu_rel
-#else /* CONFIG_SMP */
+#else /* !CONFIG_SMP: */
+
#define __percpu_seg_override
#define __percpu_prefix ""
#define __force_percpu_prefix ""
@@ -102,13 +92,13 @@
#endif /* CONFIG_SMP */
#define __my_cpu_type(var) typeof(var) __percpu_seg_override
-#define __my_cpu_ptr(ptr) (__my_cpu_type(*ptr) *)(uintptr_t)(ptr)
-#define __my_cpu_var(var) (*__my_cpu_ptr(&var))
+#define __my_cpu_ptr(ptr) (__my_cpu_type(*(ptr))*)(__force uintptr_t)(ptr)
+#define __my_cpu_var(var) (*__my_cpu_ptr(&(var)))
#define __percpu_arg(x) __percpu_prefix "%" #x
#define __force_percpu_arg(x) __force_percpu_prefix "%" #x
/*
- * Initialized pointers to per-cpu variables needed for the boot
+ * Initialized pointers to per-CPU variables needed for the boot
* processor need to use these macros to get the proper address
* offset from __per_cpu_load on SMP.
*
@@ -118,65 +108,128 @@
extern typeof(var) init_per_cpu_var(var)
#ifdef CONFIG_X86_64_SMP
-#define init_per_cpu_var(var) init_per_cpu__##var
+# define init_per_cpu_var(var) init_per_cpu__##var
#else
-#define init_per_cpu_var(var) var
+# define init_per_cpu_var(var) var
#endif
-/* For arch-specific code, we can use direct single-insn ops (they
- * don't give an lvalue though). */
+/*
+ * For arch-specific code, we can use direct single-insn ops (they
+ * don't give an lvalue though).
+ */
-#define __pcpu_type_1 u8
-#define __pcpu_type_2 u16
-#define __pcpu_type_4 u32
-#define __pcpu_type_8 u64
+#define __pcpu_type_1 u8
+#define __pcpu_type_2 u16
+#define __pcpu_type_4 u32
+#define __pcpu_type_8 u64
-#define __pcpu_cast_1(val) ((u8)(((unsigned long) val) & 0xff))
-#define __pcpu_cast_2(val) ((u16)(((unsigned long) val) & 0xffff))
-#define __pcpu_cast_4(val) ((u32)(((unsigned long) val) & 0xffffffff))
-#define __pcpu_cast_8(val) ((u64)(val))
+#define __pcpu_cast_1(val) ((u8)(((unsigned long) val) & 0xff))
+#define __pcpu_cast_2(val) ((u16)(((unsigned long) val) & 0xffff))
+#define __pcpu_cast_4(val) ((u32)(((unsigned long) val) & 0xffffffff))
+#define __pcpu_cast_8(val) ((u64)(val))
-#define __pcpu_op1_1(op, dst) op "b " dst
-#define __pcpu_op1_2(op, dst) op "w " dst
-#define __pcpu_op1_4(op, dst) op "l " dst
-#define __pcpu_op1_8(op, dst) op "q " dst
+#define __pcpu_op1_1(op, dst) op "b " dst
+#define __pcpu_op1_2(op, dst) op "w " dst
+#define __pcpu_op1_4(op, dst) op "l " dst
+#define __pcpu_op1_8(op, dst) op "q " dst
#define __pcpu_op2_1(op, src, dst) op "b " src ", " dst
#define __pcpu_op2_2(op, src, dst) op "w " src ", " dst
#define __pcpu_op2_4(op, src, dst) op "l " src ", " dst
#define __pcpu_op2_8(op, src, dst) op "q " src ", " dst
-#define __pcpu_reg_1(mod, x) mod "q" (x)
-#define __pcpu_reg_2(mod, x) mod "r" (x)
-#define __pcpu_reg_4(mod, x) mod "r" (x)
-#define __pcpu_reg_8(mod, x) mod "r" (x)
+#define __pcpu_reg_1(mod, x) mod "q" (x)
+#define __pcpu_reg_2(mod, x) mod "r" (x)
+#define __pcpu_reg_4(mod, x) mod "r" (x)
+#define __pcpu_reg_8(mod, x) mod "r" (x)
+
+#define __pcpu_reg_imm_1(x) "qi" (x)
+#define __pcpu_reg_imm_2(x) "ri" (x)
+#define __pcpu_reg_imm_4(x) "ri" (x)
+#define __pcpu_reg_imm_8(x) "re" (x)
+
+#ifdef CONFIG_USE_X86_SEG_SUPPORT
-#define __pcpu_reg_imm_1(x) "qi" (x)
-#define __pcpu_reg_imm_2(x) "ri" (x)
-#define __pcpu_reg_imm_4(x) "ri" (x)
-#define __pcpu_reg_imm_8(x) "re" (x)
+#define __raw_cpu_read(size, qual, pcp) \
+({ \
+ *(qual __my_cpu_type(pcp) *)__my_cpu_ptr(&(pcp)); \
+})
-#define percpu_to_op(size, qual, op, _var, _val) \
+#define __raw_cpu_write(size, qual, pcp, val) \
+do { \
+ *(qual __my_cpu_type(pcp) *)__my_cpu_ptr(&(pcp)) = (val); \
+} while (0)
+
+#define __raw_cpu_read_const(pcp) __raw_cpu_read(, , pcp)
+
+#else /* !CONFIG_USE_X86_SEG_SUPPORT: */
+
+#define __raw_cpu_read(size, qual, _var) \
+({ \
+ __pcpu_type_##size pfo_val__; \
+ \
+ asm qual (__pcpu_op2_##size("mov", __percpu_arg([var]), "%[val]") \
+ : [val] __pcpu_reg_##size("=", pfo_val__) \
+ : [var] "m" (__my_cpu_var(_var))); \
+ \
+ (typeof(_var))(unsigned long) pfo_val__; \
+})
+
+#define __raw_cpu_write(size, qual, _var, _val) \
do { \
__pcpu_type_##size pto_val__ = __pcpu_cast_##size(_val); \
+ \
if (0) { \
typeof(_var) pto_tmp__; \
pto_tmp__ = (_val); \
(void)pto_tmp__; \
} \
- asm qual(__pcpu_op2_##size(op, "%[val]", __percpu_arg([var])) \
- : [var] "+m" (__my_cpu_var(_var)) \
+ asm qual(__pcpu_op2_##size("mov", "%[val]", __percpu_arg([var])) \
+ : [var] "=m" (__my_cpu_var(_var)) \
: [val] __pcpu_reg_imm_##size(pto_val__)); \
} while (0)
+/*
+ * The generic per-CPU infrastrucutre is not suitable for
+ * reading const-qualified variables.
+ */
+#define __raw_cpu_read_const(pcp) ({ BUILD_BUG(); (typeof(pcp))0; })
+
+#endif /* CONFIG_USE_X86_SEG_SUPPORT */
+
+#define __raw_cpu_read_stable(size, _var) \
+({ \
+ __pcpu_type_##size pfo_val__; \
+ \
+ asm(__pcpu_op2_##size("mov", __force_percpu_arg(a[var]), "%[val]") \
+ : [val] __pcpu_reg_##size("=", pfo_val__) \
+ : [var] "i" (&(_var))); \
+ \
+ (typeof(_var))(unsigned long) pfo_val__; \
+})
+
#define percpu_unary_op(size, qual, op, _var) \
({ \
asm qual (__pcpu_op1_##size(op, __percpu_arg([var])) \
: [var] "+m" (__my_cpu_var(_var))); \
})
+#define percpu_binary_op(size, qual, op, _var, _val) \
+do { \
+ __pcpu_type_##size pto_val__ = __pcpu_cast_##size(_val); \
+ \
+ if (0) { \
+ typeof(_var) pto_tmp__; \
+ pto_tmp__ = (_val); \
+ (void)pto_tmp__; \
+ } \
+ asm qual(__pcpu_op2_##size(op, "%[val]", __percpu_arg([var])) \
+ : [var] "+m" (__my_cpu_var(_var)) \
+ : [val] __pcpu_reg_imm_##size(pto_val__)); \
+} while (0)
+
/*
- * Generate a percpu add to memory instruction and optimize code
+ * Generate a per-CPU add to memory instruction and optimize code
* if one is added or subtracted.
*/
#define percpu_add_op(size, qual, var, val) \
@@ -184,6 +237,7 @@ do { \
const int pao_ID__ = (__builtin_constant_p(val) && \
((val) == 1 || (val) == -1)) ? \
(int)(val) : 0; \
+ \
if (0) { \
typeof(var) pao_tmp__; \
pao_tmp__ = (val); \
@@ -194,33 +248,16 @@ do { \
else if (pao_ID__ == -1) \
percpu_unary_op(size, qual, "dec", var); \
else \
- percpu_to_op(size, qual, "add", var, val); \
+ percpu_binary_op(size, qual, "add", var, val); \
} while (0)
-#define percpu_from_op(size, qual, op, _var) \
-({ \
- __pcpu_type_##size pfo_val__; \
- asm qual (__pcpu_op2_##size(op, __percpu_arg([var]), "%[val]") \
- : [val] __pcpu_reg_##size("=", pfo_val__) \
- : [var] "m" (__my_cpu_var(_var))); \
- (typeof(_var))(unsigned long) pfo_val__; \
-})
-
-#define percpu_stable_op(size, op, _var) \
-({ \
- __pcpu_type_##size pfo_val__; \
- asm(__pcpu_op2_##size(op, __force_percpu_arg(a[var]), "%[val]") \
- : [val] __pcpu_reg_##size("=", pfo_val__) \
- : [var] "i" (&(_var))); \
- (typeof(_var))(unsigned long) pfo_val__; \
-})
-
/*
* Add return operation
*/
#define percpu_add_return_op(size, qual, _var, _val) \
({ \
__pcpu_type_##size paro_tmp__ = __pcpu_cast_##size(_val); \
+ \
asm qual (__pcpu_op2_##size("xadd", "%[tmp]", \
__percpu_arg([var])) \
: [tmp] __pcpu_reg_##size("+", paro_tmp__), \
@@ -230,41 +267,48 @@ do { \
})
/*
- * xchg is implemented using cmpxchg without a lock prefix. xchg is
- * expensive due to the implied lock prefix. The processor cannot prefetch
- * cachelines if xchg is used.
+ * raw_cpu_xchg() can use a load-store since
+ * it is not required to be IRQ-safe.
*/
-#define percpu_xchg_op(size, qual, _var, _nval) \
+#define raw_percpu_xchg_op(_var, _nval) \
({ \
- __pcpu_type_##size pxo_old__; \
- __pcpu_type_##size pxo_new__ = __pcpu_cast_##size(_nval); \
- asm qual (__pcpu_op2_##size("mov", __percpu_arg([var]), \
- "%[oval]") \
- "\n1:\t" \
- __pcpu_op2_##size("cmpxchg", "%[nval]", \
- __percpu_arg([var])) \
- "\n\tjnz 1b" \
- : [oval] "=&a" (pxo_old__), \
- [var] "+m" (__my_cpu_var(_var)) \
- : [nval] __pcpu_reg_##size(, pxo_new__) \
- : "memory"); \
- (typeof(_var))(unsigned long) pxo_old__; \
+ typeof(_var) pxo_old__ = raw_cpu_read(_var); \
+ \
+ raw_cpu_write(_var, _nval); \
+ \
+ pxo_old__; \
+})
+
+/*
+ * this_cpu_xchg() is implemented using CMPXCHG without a LOCK prefix.
+ * XCHG is expensive due to the implied LOCK prefix. The processor
+ * cannot prefetch cachelines if XCHG is used.
+ */
+#define this_percpu_xchg_op(_var, _nval) \
+({ \
+ typeof(_var) pxo_old__ = this_cpu_read(_var); \
+ \
+ do { } while (!this_cpu_try_cmpxchg(_var, &pxo_old__, _nval)); \
+ \
+ pxo_old__; \
})
/*
- * cmpxchg has no such implied lock semantics as a result it is much
- * more efficient for cpu local operations.
+ * CMPXCHG has no such implied lock semantics as a result it is much
+ * more efficient for CPU-local operations.
*/
#define percpu_cmpxchg_op(size, qual, _var, _oval, _nval) \
({ \
__pcpu_type_##size pco_old__ = __pcpu_cast_##size(_oval); \
__pcpu_type_##size pco_new__ = __pcpu_cast_##size(_nval); \
+ \
asm qual (__pcpu_op2_##size("cmpxchg", "%[nval]", \
__percpu_arg([var])) \
: [oval] "+a" (pco_old__), \
[var] "+m" (__my_cpu_var(_var)) \
: [nval] __pcpu_reg_##size(, pco_new__) \
: "memory"); \
+ \
(typeof(_var))(unsigned long) pco_old__; \
})
@@ -274,6 +318,7 @@ do { \
__pcpu_type_##size *pco_oval__ = (__pcpu_type_##size *)(_ovalp); \
__pcpu_type_##size pco_old__ = *pco_oval__; \
__pcpu_type_##size pco_new__ = __pcpu_cast_##size(_nval); \
+ \
asm qual (__pcpu_op2_##size("cmpxchg", "%[nval]", \
__percpu_arg([var])) \
CC_SET(z) \
@@ -284,10 +329,12 @@ do { \
: "memory"); \
if (unlikely(!success)) \
*pco_oval__ = pco_old__; \
+ \
likely(success); \
})
#if defined(CONFIG_X86_32) && !defined(CONFIG_UML)
+
#define percpu_cmpxchg64_op(size, qual, _var, _oval, _nval) \
({ \
union { \
@@ -313,8 +360,8 @@ do { \
old__.var; \
})
-#define raw_cpu_cmpxchg64(pcp, oval, nval) percpu_cmpxchg64_op(8, , pcp, oval, nval)
-#define this_cpu_cmpxchg64(pcp, oval, nval) percpu_cmpxchg64_op(8, volatile, pcp, oval, nval)
+#define raw_cpu_cmpxchg64(pcp, oval, nval) percpu_cmpxchg64_op(8, , pcp, oval, nval)
+#define this_cpu_cmpxchg64(pcp, oval, nval) percpu_cmpxchg64_op(8, volatile, pcp, oval, nval)
#define percpu_try_cmpxchg64_op(size, qual, _var, _ovalp, _nval) \
({ \
@@ -343,16 +390,18 @@ do { \
: "memory"); \
if (unlikely(!success)) \
*_oval = old__.var; \
+ \
likely(success); \
})
#define raw_cpu_try_cmpxchg64(pcp, ovalp, nval) percpu_try_cmpxchg64_op(8, , pcp, ovalp, nval)
#define this_cpu_try_cmpxchg64(pcp, ovalp, nval) percpu_try_cmpxchg64_op(8, volatile, pcp, ovalp, nval)
-#endif
+
+#endif /* defined(CONFIG_X86_32) && !defined(CONFIG_UML) */
#ifdef CONFIG_X86_64
-#define raw_cpu_cmpxchg64(pcp, oval, nval) percpu_cmpxchg_op(8, , pcp, oval, nval);
-#define this_cpu_cmpxchg64(pcp, oval, nval) percpu_cmpxchg_op(8, volatile, pcp, oval, nval);
+#define raw_cpu_cmpxchg64(pcp, oval, nval) percpu_cmpxchg_op(8, , pcp, oval, nval);
+#define this_cpu_cmpxchg64(pcp, oval, nval) percpu_cmpxchg_op(8, volatile, pcp, oval, nval);
#define raw_cpu_try_cmpxchg64(pcp, ovalp, nval) percpu_try_cmpxchg_op(8, , pcp, ovalp, nval);
#define this_cpu_try_cmpxchg64(pcp, ovalp, nval) percpu_try_cmpxchg_op(8, volatile, pcp, ovalp, nval);
@@ -382,8 +431,8 @@ do { \
old__.var; \
})
-#define raw_cpu_cmpxchg128(pcp, oval, nval) percpu_cmpxchg128_op(16, , pcp, oval, nval)
-#define this_cpu_cmpxchg128(pcp, oval, nval) percpu_cmpxchg128_op(16, volatile, pcp, oval, nval)
+#define raw_cpu_cmpxchg128(pcp, oval, nval) percpu_cmpxchg128_op(16, , pcp, oval, nval)
+#define this_cpu_cmpxchg128(pcp, oval, nval) percpu_cmpxchg128_op(16, volatile, pcp, oval, nval)
#define percpu_try_cmpxchg128_op(size, qual, _var, _ovalp, _nval) \
({ \
@@ -417,199 +466,151 @@ do { \
#define raw_cpu_try_cmpxchg128(pcp, ovalp, nval) percpu_try_cmpxchg128_op(16, , pcp, ovalp, nval)
#define this_cpu_try_cmpxchg128(pcp, ovalp, nval) percpu_try_cmpxchg128_op(16, volatile, pcp, ovalp, nval)
-#endif
+
+#endif /* CONFIG_X86_64 */
+
+#define raw_cpu_read_1(pcp) __raw_cpu_read(1, , pcp)
+#define raw_cpu_read_2(pcp) __raw_cpu_read(2, , pcp)
+#define raw_cpu_read_4(pcp) __raw_cpu_read(4, , pcp)
+#define raw_cpu_write_1(pcp, val) __raw_cpu_write(1, , pcp, val)
+#define raw_cpu_write_2(pcp, val) __raw_cpu_write(2, , pcp, val)
+#define raw_cpu_write_4(pcp, val) __raw_cpu_write(4, , pcp, val)
+
+#define this_cpu_read_1(pcp) __raw_cpu_read(1, volatile, pcp)
+#define this_cpu_read_2(pcp) __raw_cpu_read(2, volatile, pcp)
+#define this_cpu_read_4(pcp) __raw_cpu_read(4, volatile, pcp)
+#define this_cpu_write_1(pcp, val) __raw_cpu_write(1, volatile, pcp, val)
+#define this_cpu_write_2(pcp, val) __raw_cpu_write(2, volatile, pcp, val)
+#define this_cpu_write_4(pcp, val) __raw_cpu_write(4, volatile, pcp, val)
+
+#define this_cpu_read_stable_1(pcp) __raw_cpu_read_stable(1, pcp)
+#define this_cpu_read_stable_2(pcp) __raw_cpu_read_stable(2, pcp)
+#define this_cpu_read_stable_4(pcp) __raw_cpu_read_stable(4, pcp)
+
+#define raw_cpu_add_1(pcp, val) percpu_add_op(1, , (pcp), val)
+#define raw_cpu_add_2(pcp, val) percpu_add_op(2, , (pcp), val)
+#define raw_cpu_add_4(pcp, val) percpu_add_op(4, , (pcp), val)
+#define raw_cpu_and_1(pcp, val) percpu_binary_op(1, , "and", (pcp), val)
+#define raw_cpu_and_2(pcp, val) percpu_binary_op(2, , "and", (pcp), val)
+#define raw_cpu_and_4(pcp, val) percpu_binary_op(4, , "and", (pcp), val)
+#define raw_cpu_or_1(pcp, val) percpu_binary_op(1, , "or", (pcp), val)
+#define raw_cpu_or_2(pcp, val) percpu_binary_op(2, , "or", (pcp), val)
+#define raw_cpu_or_4(pcp, val) percpu_binary_op(4, , "or", (pcp), val)
+#define raw_cpu_xchg_1(pcp, val) raw_percpu_xchg_op(pcp, val)
+#define raw_cpu_xchg_2(pcp, val) raw_percpu_xchg_op(pcp, val)
+#define raw_cpu_xchg_4(pcp, val) raw_percpu_xchg_op(pcp, val)
+
+#define this_cpu_add_1(pcp, val) percpu_add_op(1, volatile, (pcp), val)
+#define this_cpu_add_2(pcp, val) percpu_add_op(2, volatile, (pcp), val)
+#define this_cpu_add_4(pcp, val) percpu_add_op(4, volatile, (pcp), val)
+#define this_cpu_and_1(pcp, val) percpu_binary_op(1, volatile, "and", (pcp), val)
+#define this_cpu_and_2(pcp, val) percpu_binary_op(2, volatile, "and", (pcp), val)
+#define this_cpu_and_4(pcp, val) percpu_binary_op(4, volatile, "and", (pcp), val)
+#define this_cpu_or_1(pcp, val) percpu_binary_op(1, volatile, "or", (pcp), val)
+#define this_cpu_or_2(pcp, val) percpu_binary_op(2, volatile, "or", (pcp), val)
+#define this_cpu_or_4(pcp, val) percpu_binary_op(4, volatile, "or", (pcp), val)
+#define this_cpu_xchg_1(pcp, nval) this_percpu_xchg_op(pcp, nval)
+#define this_cpu_xchg_2(pcp, nval) this_percpu_xchg_op(pcp, nval)
+#define this_cpu_xchg_4(pcp, nval) this_percpu_xchg_op(pcp, nval)
+
+#define raw_cpu_add_return_1(pcp, val) percpu_add_return_op(1, , pcp, val)
+#define raw_cpu_add_return_2(pcp, val) percpu_add_return_op(2, , pcp, val)
+#define raw_cpu_add_return_4(pcp, val) percpu_add_return_op(4, , pcp, val)
+#define raw_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(1, , pcp, oval, nval)
+#define raw_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(2, , pcp, oval, nval)
+#define raw_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(4, , pcp, oval, nval)
+#define raw_cpu_try_cmpxchg_1(pcp, ovalp, nval) percpu_try_cmpxchg_op(1, , pcp, ovalp, nval)
+#define raw_cpu_try_cmpxchg_2(pcp, ovalp, nval) percpu_try_cmpxchg_op(2, , pcp, ovalp, nval)
+#define raw_cpu_try_cmpxchg_4(pcp, ovalp, nval) percpu_try_cmpxchg_op(4, , pcp, ovalp, nval)
+
+#define this_cpu_add_return_1(pcp, val) percpu_add_return_op(1, volatile, pcp, val)
+#define this_cpu_add_return_2(pcp, val) percpu_add_return_op(2, volatile, pcp, val)
+#define this_cpu_add_return_4(pcp, val) percpu_add_return_op(4, volatile, pcp, val)
+#define this_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(1, volatile, pcp, oval, nval)
+#define this_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(2, volatile, pcp, oval, nval)
+#define this_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(4, volatile, pcp, oval, nval)
+#define this_cpu_try_cmpxchg_1(pcp, ovalp, nval) percpu_try_cmpxchg_op(1, volatile, pcp, ovalp, nval)
+#define this_cpu_try_cmpxchg_2(pcp, ovalp, nval) percpu_try_cmpxchg_op(2, volatile, pcp, ovalp, nval)
+#define this_cpu_try_cmpxchg_4(pcp, ovalp, nval) percpu_try_cmpxchg_op(4, volatile, pcp, ovalp, nval)
/*
- * this_cpu_read() makes gcc load the percpu variable every time it is
- * accessed while this_cpu_read_stable() allows the value to be cached.
- * this_cpu_read_stable() is more efficient and can be used if its value
- * is guaranteed to be valid across cpus. The current users include
- * pcpu_hot.current_task and pcpu_hot.top_of_stack, both of which are
- * actually per-thread variables implemented as per-CPU variables and
- * thus stable for the duration of the respective task.
+ * Per-CPU atomic 64-bit operations are only available under 64-bit kernels.
+ * 32-bit kernels must fall back to generic operations.
*/
-#define this_cpu_read_stable_1(pcp) percpu_stable_op(1, "mov", pcp)
-#define this_cpu_read_stable_2(pcp) percpu_stable_op(2, "mov", pcp)
-#define this_cpu_read_stable_4(pcp) percpu_stable_op(4, "mov", pcp)
-#define this_cpu_read_stable_8(pcp) percpu_stable_op(8, "mov", pcp)
-#define this_cpu_read_stable(pcp) __pcpu_size_call_return(this_cpu_read_stable_, pcp)
+#ifdef CONFIG_X86_64
-#ifdef CONFIG_USE_X86_SEG_SUPPORT
+#define raw_cpu_read_8(pcp) __raw_cpu_read(8, , pcp)
+#define raw_cpu_write_8(pcp, val) __raw_cpu_write(8, , pcp, val)
-#define __raw_cpu_read(qual, pcp) \
-({ \
- *(qual __my_cpu_type(pcp) *)__my_cpu_ptr(&(pcp)); \
-})
-
-#define __raw_cpu_write(qual, pcp, val) \
-do { \
- *(qual __my_cpu_type(pcp) *)__my_cpu_ptr(&(pcp)) = (val); \
-} while (0)
+#define this_cpu_read_8(pcp) __raw_cpu_read(8, volatile, pcp)
+#define this_cpu_write_8(pcp, val) __raw_cpu_write(8, volatile, pcp, val)
-#define raw_cpu_read_1(pcp) __raw_cpu_read(, pcp)
-#define raw_cpu_read_2(pcp) __raw_cpu_read(, pcp)
-#define raw_cpu_read_4(pcp) __raw_cpu_read(, pcp)
-#define raw_cpu_write_1(pcp, val) __raw_cpu_write(, pcp, val)
-#define raw_cpu_write_2(pcp, val) __raw_cpu_write(, pcp, val)
-#define raw_cpu_write_4(pcp, val) __raw_cpu_write(, pcp, val)
+#define this_cpu_read_stable_8(pcp) __raw_cpu_read_stable(8, pcp)
-#define this_cpu_read_1(pcp) __raw_cpu_read(volatile, pcp)
-#define this_cpu_read_2(pcp) __raw_cpu_read(volatile, pcp)
-#define this_cpu_read_4(pcp) __raw_cpu_read(volatile, pcp)
-#define this_cpu_write_1(pcp, val) __raw_cpu_write(volatile, pcp, val)
-#define this_cpu_write_2(pcp, val) __raw_cpu_write(volatile, pcp, val)
-#define this_cpu_write_4(pcp, val) __raw_cpu_write(volatile, pcp, val)
+#define raw_cpu_add_8(pcp, val) percpu_add_op(8, , (pcp), val)
+#define raw_cpu_and_8(pcp, val) percpu_binary_op(8, , "and", (pcp), val)
+#define raw_cpu_or_8(pcp, val) percpu_binary_op(8, , "or", (pcp), val)
+#define raw_cpu_add_return_8(pcp, val) percpu_add_return_op(8, , pcp, val)
+#define raw_cpu_xchg_8(pcp, nval) raw_percpu_xchg_op(pcp, nval)
+#define raw_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(8, , pcp, oval, nval)
+#define raw_cpu_try_cmpxchg_8(pcp, ovalp, nval) percpu_try_cmpxchg_op(8, , pcp, ovalp, nval)
-#ifdef CONFIG_X86_64
-#define raw_cpu_read_8(pcp) __raw_cpu_read(, pcp)
-#define raw_cpu_write_8(pcp, val) __raw_cpu_write(, pcp, val)
+#define this_cpu_add_8(pcp, val) percpu_add_op(8, volatile, (pcp), val)
+#define this_cpu_and_8(pcp, val) percpu_binary_op(8, volatile, "and", (pcp), val)
+#define this_cpu_or_8(pcp, val) percpu_binary_op(8, volatile, "or", (pcp), val)
+#define this_cpu_add_return_8(pcp, val) percpu_add_return_op(8, volatile, pcp, val)
+#define this_cpu_xchg_8(pcp, nval) this_percpu_xchg_op(pcp, nval)
+#define this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(8, volatile, pcp, oval, nval)
+#define this_cpu_try_cmpxchg_8(pcp, ovalp, nval) percpu_try_cmpxchg_op(8, volatile, pcp, ovalp, nval)
-#define this_cpu_read_8(pcp) __raw_cpu_read(volatile, pcp)
-#define this_cpu_write_8(pcp, val) __raw_cpu_write(volatile, pcp, val)
-#endif
+#define raw_cpu_read_long(pcp) raw_cpu_read_8(pcp)
-#define this_cpu_read_const(pcp) __raw_cpu_read(, pcp)
-#else /* CONFIG_USE_X86_SEG_SUPPORT */
+#else /* !CONFIG_X86_64: */
-#define raw_cpu_read_1(pcp) percpu_from_op(1, , "mov", pcp)
-#define raw_cpu_read_2(pcp) percpu_from_op(2, , "mov", pcp)
-#define raw_cpu_read_4(pcp) percpu_from_op(4, , "mov", pcp)
-#define raw_cpu_write_1(pcp, val) percpu_to_op(1, , "mov", (pcp), val)
-#define raw_cpu_write_2(pcp, val) percpu_to_op(2, , "mov", (pcp), val)
-#define raw_cpu_write_4(pcp, val) percpu_to_op(4, , "mov", (pcp), val)
+/* There is no generic 64-bit read stable operation for 32-bit targets. */
+#define this_cpu_read_stable_8(pcp) ({ BUILD_BUG(); (typeof(pcp))0; })
-#define this_cpu_read_1(pcp) percpu_from_op(1, volatile, "mov", pcp)
-#define this_cpu_read_2(pcp) percpu_from_op(2, volatile, "mov", pcp)
-#define this_cpu_read_4(pcp) percpu_from_op(4, volatile, "mov", pcp)
-#define this_cpu_write_1(pcp, val) percpu_to_op(1, volatile, "mov", (pcp), val)
-#define this_cpu_write_2(pcp, val) percpu_to_op(2, volatile, "mov", (pcp), val)
-#define this_cpu_write_4(pcp, val) percpu_to_op(4, volatile, "mov", (pcp), val)
+#define raw_cpu_read_long(pcp) raw_cpu_read_4(pcp)
-#ifdef CONFIG_X86_64
-#define raw_cpu_read_8(pcp) percpu_from_op(8, , "mov", pcp)
-#define raw_cpu_write_8(pcp, val) percpu_to_op(8, , "mov", (pcp), val)
+#endif /* CONFIG_X86_64 */
-#define this_cpu_read_8(pcp) percpu_from_op(8, volatile, "mov", pcp)
-#define this_cpu_write_8(pcp, val) percpu_to_op(8, volatile, "mov", (pcp), val)
-#endif
+#define this_cpu_read_const(pcp) __raw_cpu_read_const(pcp)
/*
- * The generic per-cpu infrastrucutre is not suitable for
- * reading const-qualified variables.
+ * this_cpu_read() makes the compiler load the per-CPU variable every time
+ * it is accessed while this_cpu_read_stable() allows the value to be cached.
+ * this_cpu_read_stable() is more efficient and can be used if its value
+ * is guaranteed to be valid across CPUs. The current users include
+ * pcpu_hot.current_task and pcpu_hot.top_of_stack, both of which are
+ * actually per-thread variables implemented as per-CPU variables and
+ * thus stable for the duration of the respective task.
*/
-#define this_cpu_read_const(pcp) ({ BUILD_BUG(); (typeof(pcp))0; })
-#endif /* CONFIG_USE_X86_SEG_SUPPORT */
+#define this_cpu_read_stable(pcp) __pcpu_size_call_return(this_cpu_read_stable_, pcp)
-#define raw_cpu_add_1(pcp, val) percpu_add_op(1, , (pcp), val)
-#define raw_cpu_add_2(pcp, val) percpu_add_op(2, , (pcp), val)
-#define raw_cpu_add_4(pcp, val) percpu_add_op(4, , (pcp), val)
-#define raw_cpu_and_1(pcp, val) percpu_to_op(1, , "and", (pcp), val)
-#define raw_cpu_and_2(pcp, val) percpu_to_op(2, , "and", (pcp), val)
-#define raw_cpu_and_4(pcp, val) percpu_to_op(4, , "and", (pcp), val)
-#define raw_cpu_or_1(pcp, val) percpu_to_op(1, , "or", (pcp), val)
-#define raw_cpu_or_2(pcp, val) percpu_to_op(2, , "or", (pcp), val)
-#define raw_cpu_or_4(pcp, val) percpu_to_op(4, , "or", (pcp), val)
-
-/*
- * raw_cpu_xchg() can use a load-store since it is not required to be
- * IRQ-safe.
- */
-#define raw_percpu_xchg_op(var, nval) \
+#define x86_this_cpu_constant_test_bit(_nr, _var) \
({ \
- typeof(var) pxo_ret__ = raw_cpu_read(var); \
- raw_cpu_write(var, (nval)); \
- pxo_ret__; \
+ unsigned long __percpu *addr__ = \
+ (unsigned long __percpu *)&(_var) + ((_nr) / BITS_PER_LONG); \
+ \
+ !!((1UL << ((_nr) % BITS_PER_LONG)) & raw_cpu_read(*addr__)); \
})
-#define raw_cpu_xchg_1(pcp, val) raw_percpu_xchg_op(pcp, val)
-#define raw_cpu_xchg_2(pcp, val) raw_percpu_xchg_op(pcp, val)
-#define raw_cpu_xchg_4(pcp, val) raw_percpu_xchg_op(pcp, val)
-
-#define this_cpu_add_1(pcp, val) percpu_add_op(1, volatile, (pcp), val)
-#define this_cpu_add_2(pcp, val) percpu_add_op(2, volatile, (pcp), val)
-#define this_cpu_add_4(pcp, val) percpu_add_op(4, volatile, (pcp), val)
-#define this_cpu_and_1(pcp, val) percpu_to_op(1, volatile, "and", (pcp), val)
-#define this_cpu_and_2(pcp, val) percpu_to_op(2, volatile, "and", (pcp), val)
-#define this_cpu_and_4(pcp, val) percpu_to_op(4, volatile, "and", (pcp), val)
-#define this_cpu_or_1(pcp, val) percpu_to_op(1, volatile, "or", (pcp), val)
-#define this_cpu_or_2(pcp, val) percpu_to_op(2, volatile, "or", (pcp), val)
-#define this_cpu_or_4(pcp, val) percpu_to_op(4, volatile, "or", (pcp), val)
-#define this_cpu_xchg_1(pcp, nval) percpu_xchg_op(1, volatile, pcp, nval)
-#define this_cpu_xchg_2(pcp, nval) percpu_xchg_op(2, volatile, pcp, nval)
-#define this_cpu_xchg_4(pcp, nval) percpu_xchg_op(4, volatile, pcp, nval)
-
-#define raw_cpu_add_return_1(pcp, val) percpu_add_return_op(1, , pcp, val)
-#define raw_cpu_add_return_2(pcp, val) percpu_add_return_op(2, , pcp, val)
-#define raw_cpu_add_return_4(pcp, val) percpu_add_return_op(4, , pcp, val)
-#define raw_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(1, , pcp, oval, nval)
-#define raw_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(2, , pcp, oval, nval)
-#define raw_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(4, , pcp, oval, nval)
-#define raw_cpu_try_cmpxchg_1(pcp, ovalp, nval) percpu_try_cmpxchg_op(1, , pcp, ovalp, nval)
-#define raw_cpu_try_cmpxchg_2(pcp, ovalp, nval) percpu_try_cmpxchg_op(2, , pcp, ovalp, nval)
-#define raw_cpu_try_cmpxchg_4(pcp, ovalp, nval) percpu_try_cmpxchg_op(4, , pcp, ovalp, nval)
-
-#define this_cpu_add_return_1(pcp, val) percpu_add_return_op(1, volatile, pcp, val)
-#define this_cpu_add_return_2(pcp, val) percpu_add_return_op(2, volatile, pcp, val)
-#define this_cpu_add_return_4(pcp, val) percpu_add_return_op(4, volatile, pcp, val)
-#define this_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(1, volatile, pcp, oval, nval)
-#define this_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(2, volatile, pcp, oval, nval)
-#define this_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(4, volatile, pcp, oval, nval)
-#define this_cpu_try_cmpxchg_1(pcp, ovalp, nval) percpu_try_cmpxchg_op(1, volatile, pcp, ovalp, nval)
-#define this_cpu_try_cmpxchg_2(pcp, ovalp, nval) percpu_try_cmpxchg_op(2, volatile, pcp, ovalp, nval)
-#define this_cpu_try_cmpxchg_4(pcp, ovalp, nval) percpu_try_cmpxchg_op(4, volatile, pcp, ovalp, nval)
-
-/*
- * Per cpu atomic 64 bit operations are only available under 64 bit.
- * 32 bit must fall back to generic operations.
- */
-#ifdef CONFIG_X86_64
-#define raw_cpu_add_8(pcp, val) percpu_add_op(8, , (pcp), val)
-#define raw_cpu_and_8(pcp, val) percpu_to_op(8, , "and", (pcp), val)
-#define raw_cpu_or_8(pcp, val) percpu_to_op(8, , "or", (pcp), val)
-#define raw_cpu_add_return_8(pcp, val) percpu_add_return_op(8, , pcp, val)
-#define raw_cpu_xchg_8(pcp, nval) raw_percpu_xchg_op(pcp, nval)
-#define raw_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(8, , pcp, oval, nval)
-#define raw_cpu_try_cmpxchg_8(pcp, ovalp, nval) percpu_try_cmpxchg_op(8, , pcp, ovalp, nval)
-
-#define this_cpu_add_8(pcp, val) percpu_add_op(8, volatile, (pcp), val)
-#define this_cpu_and_8(pcp, val) percpu_to_op(8, volatile, "and", (pcp), val)
-#define this_cpu_or_8(pcp, val) percpu_to_op(8, volatile, "or", (pcp), val)
-#define this_cpu_add_return_8(pcp, val) percpu_add_return_op(8, volatile, pcp, val)
-#define this_cpu_xchg_8(pcp, nval) percpu_xchg_op(8, volatile, pcp, nval)
-#define this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(8, volatile, pcp, oval, nval)
-#define this_cpu_try_cmpxchg_8(pcp, ovalp, nval) percpu_try_cmpxchg_op(8, volatile, pcp, ovalp, nval)
-#endif
-
-static __always_inline bool x86_this_cpu_constant_test_bit(unsigned int nr,
- const unsigned long __percpu *addr)
-{
- unsigned long __percpu *a =
- (unsigned long __percpu *)addr + nr / BITS_PER_LONG;
-
-#ifdef CONFIG_X86_64
- return ((1UL << (nr % BITS_PER_LONG)) & raw_cpu_read_8(*a)) != 0;
-#else
- return ((1UL << (nr % BITS_PER_LONG)) & raw_cpu_read_4(*a)) != 0;
-#endif
-}
-
-static inline bool x86_this_cpu_variable_test_bit(int nr,
- const unsigned long __percpu *addr)
-{
- bool oldbit;
-
- asm volatile("btl "__percpu_arg(2)",%1"
- CC_SET(c)
- : CC_OUT(c) (oldbit)
- : "m" (*__my_cpu_ptr((unsigned long __percpu *)(addr))), "Ir" (nr));
-
- return oldbit;
-}
+#define x86_this_cpu_variable_test_bit(_nr, _var) \
+({ \
+ bool oldbit; \
+ \
+ asm volatile("btl %[nr], " __percpu_arg([var]) \
+ CC_SET(c) \
+ : CC_OUT(c) (oldbit) \
+ : [var] "m" (__my_cpu_var(_var)), \
+ [nr] "rI" (_nr)); \
+ oldbit; \
+})
-#define x86_this_cpu_test_bit(nr, addr) \
- (__builtin_constant_p((nr)) \
- ? x86_this_cpu_constant_test_bit((nr), (addr)) \
- : x86_this_cpu_variable_test_bit((nr), (addr)))
+#define x86_this_cpu_test_bit(_nr, _var) \
+ (__builtin_constant_p(_nr) \
+ ? x86_this_cpu_constant_test_bit(_nr, _var) \
+ : x86_this_cpu_variable_test_bit(_nr, _var))
#include <asm-generic/percpu.h>
@@ -639,46 +640,47 @@ DECLARE_PER_CPU_READ_MOSTLY(unsigned long, this_cpu_off);
{ [0 ... NR_CPUS-1] = _initvalue }; \
__typeof__(_type) *_name##_early_ptr __refdata = _name##_early_map
-#define EXPORT_EARLY_PER_CPU_SYMBOL(_name) \
+#define EXPORT_EARLY_PER_CPU_SYMBOL(_name) \
EXPORT_PER_CPU_SYMBOL(_name)
-#define DECLARE_EARLY_PER_CPU(_type, _name) \
- DECLARE_PER_CPU(_type, _name); \
- extern __typeof__(_type) *_name##_early_ptr; \
+#define DECLARE_EARLY_PER_CPU(_type, _name) \
+ DECLARE_PER_CPU(_type, _name); \
+ extern __typeof__(_type) *_name##_early_ptr; \
extern __typeof__(_type) _name##_early_map[]
-#define DECLARE_EARLY_PER_CPU_READ_MOSTLY(_type, _name) \
- DECLARE_PER_CPU_READ_MOSTLY(_type, _name); \
- extern __typeof__(_type) *_name##_early_ptr; \
+#define DECLARE_EARLY_PER_CPU_READ_MOSTLY(_type, _name) \
+ DECLARE_PER_CPU_READ_MOSTLY(_type, _name); \
+ extern __typeof__(_type) *_name##_early_ptr; \
extern __typeof__(_type) _name##_early_map[]
-#define early_per_cpu_ptr(_name) (_name##_early_ptr)
-#define early_per_cpu_map(_name, _idx) (_name##_early_map[_idx])
-#define early_per_cpu(_name, _cpu) \
- *(early_per_cpu_ptr(_name) ? \
- &early_per_cpu_ptr(_name)[_cpu] : \
+#define early_per_cpu_ptr(_name) (_name##_early_ptr)
+#define early_per_cpu_map(_name, _idx) (_name##_early_map[_idx])
+
+#define early_per_cpu(_name, _cpu) \
+ *(early_per_cpu_ptr(_name) ? \
+ &early_per_cpu_ptr(_name)[_cpu] : \
&per_cpu(_name, _cpu))
-#else /* !CONFIG_SMP */
-#define DEFINE_EARLY_PER_CPU(_type, _name, _initvalue) \
+#else /* !CONFIG_SMP: */
+#define DEFINE_EARLY_PER_CPU(_type, _name, _initvalue) \
DEFINE_PER_CPU(_type, _name) = _initvalue
#define DEFINE_EARLY_PER_CPU_READ_MOSTLY(_type, _name, _initvalue) \
DEFINE_PER_CPU_READ_MOSTLY(_type, _name) = _initvalue
-#define EXPORT_EARLY_PER_CPU_SYMBOL(_name) \
+#define EXPORT_EARLY_PER_CPU_SYMBOL(_name) \
EXPORT_PER_CPU_SYMBOL(_name)
-#define DECLARE_EARLY_PER_CPU(_type, _name) \
+#define DECLARE_EARLY_PER_CPU(_type, _name) \
DECLARE_PER_CPU(_type, _name)
-#define DECLARE_EARLY_PER_CPU_READ_MOSTLY(_type, _name) \
+#define DECLARE_EARLY_PER_CPU_READ_MOSTLY(_type, _name) \
DECLARE_PER_CPU_READ_MOSTLY(_type, _name)
-#define early_per_cpu(_name, _cpu) per_cpu(_name, _cpu)
-#define early_per_cpu_ptr(_name) NULL
+#define early_per_cpu(_name, _cpu) per_cpu(_name, _cpu)
+#define early_per_cpu_ptr(_name) NULL
/* no early_per_cpu_map() */
-#endif /* !CONFIG_SMP */
+#endif /* !CONFIG_SMP */
#endif /* _ASM_X86_PERCPU_H */
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 7f1e17250546..91b73571412f 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -32,6 +32,8 @@
#define ARCH_PERFMON_EVENTSEL_INV (1ULL << 23)
#define ARCH_PERFMON_EVENTSEL_CMASK 0xFF000000ULL
#define ARCH_PERFMON_EVENTSEL_BR_CNTR (1ULL << 35)
+#define ARCH_PERFMON_EVENTSEL_EQ (1ULL << 36)
+#define ARCH_PERFMON_EVENTSEL_UMASK2 (0xFFULL << 40)
#define INTEL_FIXED_BITS_MASK 0xFULL
#define INTEL_FIXED_BITS_STRIDE 4
@@ -185,6 +187,8 @@ union cpuid10_edx {
* detection/enumeration details:
*/
#define ARCH_PERFMON_EXT_LEAF 0x00000023
+#define ARCH_PERFMON_EXT_UMASK2 0x1
+#define ARCH_PERFMON_EXT_EQ 0x2
#define ARCH_PERFMON_NUM_COUNTER_LEAF_BIT 0x1
#define ARCH_PERFMON_NUM_COUNTER_LEAF 0x1
@@ -307,6 +311,10 @@ struct x86_pmu_capability {
#define INTEL_PMC_IDX_FIXED_SLOTS (INTEL_PMC_IDX_FIXED + 3)
#define INTEL_PMC_MSK_FIXED_SLOTS (1ULL << INTEL_PMC_IDX_FIXED_SLOTS)
+/* TOPDOWN_BAD_SPECULATION.ALL: fixed counter 4 (Atom only) */
+/* TOPDOWN_FE_BOUND.ALL: fixed counter 5 (Atom only) */
+/* TOPDOWN_RETIRING.ALL: fixed counter 6 (Atom only) */
+
static inline bool use_fixed_pseudo_encoding(u64 code)
{
return !(code & 0xff);
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 315535ffb258..4c2d080d26b4 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -120,6 +120,34 @@ extern pmdval_t early_pmd_flags;
#define arch_end_context_switch(prev) do {} while(0)
#endif /* CONFIG_PARAVIRT_XXL */
+static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set)
+{
+ pmdval_t v = native_pmd_val(pmd);
+
+ return native_make_pmd(v | set);
+}
+
+static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear)
+{
+ pmdval_t v = native_pmd_val(pmd);
+
+ return native_make_pmd(v & ~clear);
+}
+
+static inline pud_t pud_set_flags(pud_t pud, pudval_t set)
+{
+ pudval_t v = native_pud_val(pud);
+
+ return native_make_pud(v | set);
+}
+
+static inline pud_t pud_clear_flags(pud_t pud, pudval_t clear)
+{
+ pudval_t v = native_pud_val(pud);
+
+ return native_make_pud(v & ~clear);
+}
+
/*
* The following only work if pte_present() is true.
* Undefined behaviour if not..
@@ -140,6 +168,11 @@ static inline int pte_young(pte_t pte)
return pte_flags(pte) & _PAGE_ACCESSED;
}
+static inline bool pte_decrypted(pte_t pte)
+{
+ return cc_mkdec(pte_val(pte)) == pte_val(pte);
+}
+
#define pmd_dirty pmd_dirty
static inline bool pmd_dirty(pmd_t pmd)
{
@@ -169,6 +202,13 @@ static inline int pud_young(pud_t pud)
return pud_flags(pud) & _PAGE_ACCESSED;
}
+static inline bool pud_shstk(pud_t pud)
+{
+ return cpu_feature_enabled(X86_FEATURE_SHSTK) &&
+ (pud_flags(pud) & (_PAGE_RW | _PAGE_DIRTY | _PAGE_PSE)) ==
+ (_PAGE_DIRTY | _PAGE_PSE);
+}
+
static inline int pte_write(pte_t pte)
{
/*
@@ -234,6 +274,7 @@ static inline unsigned long pmd_pfn(pmd_t pmd)
return (pfn & pmd_pfn_mask(pmd)) >> PAGE_SHIFT;
}
+#define pud_pfn pud_pfn
static inline unsigned long pud_pfn(pud_t pud)
{
phys_addr_t pfn = pud_val(pud);
@@ -304,6 +345,30 @@ static inline int pud_devmap(pud_t pud)
}
#endif
+#ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP
+static inline bool pmd_special(pmd_t pmd)
+{
+ return pmd_flags(pmd) & _PAGE_SPECIAL;
+}
+
+static inline pmd_t pmd_mkspecial(pmd_t pmd)
+{
+ return pmd_set_flags(pmd, _PAGE_SPECIAL);
+}
+#endif /* CONFIG_ARCH_SUPPORTS_PMD_PFNMAP */
+
+#ifdef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP
+static inline bool pud_special(pud_t pud)
+{
+ return pud_flags(pud) & _PAGE_SPECIAL;
+}
+
+static inline pud_t pud_mkspecial(pud_t pud)
+{
+ return pud_set_flags(pud, _PAGE_SPECIAL);
+}
+#endif /* CONFIG_ARCH_SUPPORTS_PUD_PFNMAP */
+
static inline int pgd_devmap(pgd_t pgd)
{
return 0;
@@ -387,23 +452,7 @@ static inline pte_t pte_wrprotect(pte_t pte)
#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
static inline int pte_uffd_wp(pte_t pte)
{
- bool wp = pte_flags(pte) & _PAGE_UFFD_WP;
-
-#ifdef CONFIG_DEBUG_VM
- /*
- * Having write bit for wr-protect-marked present ptes is fatal,
- * because it means the uffd-wp bit will be ignored and write will
- * just go through.
- *
- * Use any chance of pgtable walking to verify this (e.g., when
- * page swapped out or being migrated for all purposes). It means
- * something is already wrong. Tell the admin even before the
- * process crashes. We also nail it with wrong pgtable setup.
- */
- WARN_ON_ONCE(wp && pte_write(pte));
-#endif
-
- return wp;
+ return pte_flags(pte) & _PAGE_UFFD_WP;
}
static inline pte_t pte_mkuffd_wp(pte_t pte)
@@ -490,20 +539,6 @@ static inline pte_t pte_mkdevmap(pte_t pte)
return pte_set_flags(pte, _PAGE_SPECIAL|_PAGE_DEVMAP);
}
-static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set)
-{
- pmdval_t v = native_pmd_val(pmd);
-
- return native_make_pmd(v | set);
-}
-
-static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear)
-{
- pmdval_t v = native_pmd_val(pmd);
-
- return native_make_pmd(v & ~clear);
-}
-
/* See comments above mksaveddirty_shift() */
static inline pmd_t pmd_mksaveddirty(pmd_t pmd)
{
@@ -598,20 +633,6 @@ static inline pmd_t pmd_mkwrite_novma(pmd_t pmd)
pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
#define pmd_mkwrite pmd_mkwrite
-static inline pud_t pud_set_flags(pud_t pud, pudval_t set)
-{
- pudval_t v = native_pud_val(pud);
-
- return native_make_pud(v | set);
-}
-
-static inline pud_t pud_clear_flags(pud_t pud, pudval_t clear)
-{
- pudval_t v = native_pud_val(pud);
-
- return native_make_pud(v & ~clear);
-}
-
/* See comments above mksaveddirty_shift() */
static inline pud_t pud_mksaveddirty(pud_t pud)
{
@@ -790,6 +811,12 @@ static inline pmd_t pmd_mkinvalid(pmd_t pmd)
__pgprot(pmd_flags(pmd) & ~(_PAGE_PRESENT|_PAGE_PROTNONE)));
}
+static inline pud_t pud_mkinvalid(pud_t pud)
+{
+ return pfn_pud(pud_pfn(pud),
+ __pgprot(pud_flags(pud) & ~(_PAGE_PRESENT|_PAGE_PROTNONE)));
+}
+
static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask);
static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
@@ -837,14 +864,8 @@ static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
pmd_result = __pmd(val);
/*
- * To avoid creating Write=0,Dirty=1 PMDs, pte_modify() needs to avoid:
- * 1. Marking Write=0 PMDs Dirty=1
- * 2. Marking Dirty=1 PMDs Write=0
- *
- * The first case cannot happen because the _PAGE_CHG_MASK will filter
- * out any Dirty bit passed in newprot. Handle the second case by
- * going through the mksaveddirty exercise. Only do this if the old
- * value was Write=1 to avoid doing this on Shadow Stack PTEs.
+ * Avoid creating shadow stack PMD by accident. See comment in
+ * pte_modify().
*/
if (oldval & _PAGE_RW)
pmd_result = pmd_mksaveddirty(pmd_result);
@@ -854,6 +875,29 @@ static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
return pmd_result;
}
+static inline pud_t pud_modify(pud_t pud, pgprot_t newprot)
+{
+ pudval_t val = pud_val(pud), oldval = val;
+ pud_t pud_result;
+
+ val &= _HPAGE_CHG_MASK;
+ val |= check_pgprot(newprot) & ~_HPAGE_CHG_MASK;
+ val = flip_protnone_guard(oldval, val, PHYSICAL_PUD_PAGE_MASK);
+
+ pud_result = __pud(val);
+
+ /*
+ * Avoid creating shadow stack PUD by accident. See comment in
+ * pte_modify().
+ */
+ if (oldval & _PAGE_RW)
+ pud_result = pud_mksaveddirty(pud_result);
+ else
+ pud_result = pud_clear_saveddirty(pud_result);
+
+ return pud_result;
+}
+
/*
* mprotect needs to preserve PAT and encryption bits when updating
* vm_page_prot
@@ -1088,8 +1132,7 @@ static inline pmd_t *pud_pgtable(pud_t pud)
#define pud_leaf pud_leaf
static inline bool pud_leaf(pud_t pud)
{
- return (pud_val(pud) & (_PAGE_PSE | _PAGE_PRESENT)) ==
- (_PAGE_PSE | _PAGE_PRESENT);
+ return pud_val(pud) & _PAGE_PSE;
}
static inline int pud_bad(pud_t pud)
@@ -1200,7 +1243,6 @@ static inline int pgd_none(pgd_t pgd)
extern int direct_gbpages;
void init_mem_mapping(void);
void early_alloc_pgt_buf(void);
-extern void memblock_find_dma_reserve(void);
void __init poking_init(void);
unsigned long init_memory_mapping(unsigned long start,
unsigned long end, pgprot_t prot);
@@ -1394,10 +1436,28 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
}
#endif
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+static inline pud_t pudp_establish(struct vm_area_struct *vma,
+ unsigned long address, pud_t *pudp, pud_t pud)
+{
+ page_table_check_pud_set(vma->vm_mm, pudp, pud);
+ if (IS_ENABLED(CONFIG_SMP)) {
+ return xchg(pudp, pud);
+ } else {
+ pud_t old = *pudp;
+ WRITE_ONCE(*pudp, pud);
+ return old;
+ }
+}
+#endif
+
#define __HAVE_ARCH_PMDP_INVALIDATE_AD
extern pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp);
+pud_t pudp_invalidate(struct vm_area_struct *vma, unsigned long address,
+ pud_t *pudp);
+
/*
* Page table pages are page-aligned. The lower half of the top
* level is used for userspace and the top half for the kernel.
@@ -1679,6 +1739,9 @@ void arch_check_zapped_pte(struct vm_area_struct *vma, pte_t pte);
#define arch_check_zapped_pmd arch_check_zapped_pmd
void arch_check_zapped_pmd(struct vm_area_struct *vma, pmd_t pmd);
+#define arch_check_zapped_pud arch_check_zapped_pud
+void arch_check_zapped_pud(struct vm_area_struct *vma, pud_t pud);
+
#ifdef CONFIG_XEN_PV
#define arch_has_hw_nonleaf_pmd_young arch_has_hw_nonleaf_pmd_young
static inline bool arch_has_hw_nonleaf_pmd_young(void)
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 7e9db77231ac..d1426b64c1b9 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -270,5 +270,26 @@ static inline bool gup_fast_permitted(unsigned long start, unsigned long end)
#include <asm/pgtable-invert.h>
-#endif /* !__ASSEMBLY__ */
+#else /* __ASSEMBLY__ */
+
+#define l4_index(x) (((x) >> 39) & 511)
+#define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD - 1))
+
+L4_PAGE_OFFSET = l4_index(__PAGE_OFFSET_BASE_L4)
+L4_START_KERNEL = l4_index(__START_KERNEL_map)
+
+L3_START_KERNEL = pud_index(__START_KERNEL_map)
+
+#define SYM_DATA_START_PAGE_ALIGNED(name) \
+ SYM_START(name, SYM_L_GLOBAL, .balign PAGE_SIZE)
+
+/* Automate the creation of 1 to 1 mapping pmd entries */
+#define PMDS(START, PERM, COUNT) \
+ i = 0 ; \
+ .rept (COUNT) ; \
+ .quad (START) + (i << PMD_SHIFT) + (PERM) ; \
+ i = i + 1 ; \
+ .endr
+
+#endif /* __ASSEMBLY__ */
#endif /* _ASM_X86_PGTABLE_64_H */
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 9053dfe9fa03..a98e53491a4e 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -140,6 +140,10 @@ extern unsigned int ptrs_per_p4d;
# define VMEMMAP_START __VMEMMAP_BASE_L4
#endif /* CONFIG_DYNAMIC_MEMORY_LAYOUT */
+#ifdef CONFIG_RANDOMIZE_MEMORY
+# define PHYSMEM_END physmem_end
+#endif
+
/*
* End of the region for which vmalloc page tables are pre-allocated.
* For non-KMSAN builds, this is the same as VMALLOC_END.
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 9abb8cc4cd47..6f82e75b6149 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -517,8 +517,6 @@ typedef struct page *pgtable_t;
extern pteval_t __supported_pte_mask;
extern pteval_t __default_kernel_pte_mask;
-extern void set_nx(void);
-extern int nx_enabled;
#define pgprot_writecombine pgprot_writecombine
extern pgprot_t pgprot_writecombine(pgprot_t prot);
@@ -549,6 +547,7 @@ enum pg_level {
PG_LEVEL_2M,
PG_LEVEL_1G,
PG_LEVEL_512G,
+ PG_LEVEL_256T,
PG_LEVEL_NUM
};
@@ -567,6 +566,8 @@ static inline void update_page_count(int level, unsigned long pages) { }
extern pte_t *lookup_address(unsigned long address, unsigned int *level);
extern pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
unsigned int *level);
+pte_t *lookup_address_in_pgd_attr(pgd_t *pgd, unsigned long address,
+ unsigned int *level, bool *nx, bool *rw);
extern pmd_t *lookup_pmd_address(unsigned long address);
extern phys_addr_t slow_virt_to_phys(void *__address);
extern int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn,
diff --git a/arch/x86/include/asm/posted_intr.h b/arch/x86/include/asm/posted_intr.h
new file mode 100644
index 000000000000..de788b400fba
--- /dev/null
+++ b/arch/x86/include/asm/posted_intr.h
@@ -0,0 +1,118 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _X86_POSTED_INTR_H
+#define _X86_POSTED_INTR_H
+#include <asm/irq_vectors.h>
+
+#define POSTED_INTR_ON 0
+#define POSTED_INTR_SN 1
+
+#define PID_TABLE_ENTRY_VALID 1
+
+/* Posted-Interrupt Descriptor */
+struct pi_desc {
+ union {
+ u32 pir[8]; /* Posted interrupt requested */
+ u64 pir64[4];
+ };
+ union {
+ struct {
+ u16 notifications; /* Suppress and outstanding bits */
+ u8 nv;
+ u8 rsvd_2;
+ u32 ndst;
+ };
+ u64 control;
+ };
+ u32 rsvd[6];
+} __aligned(64);
+
+static inline bool pi_test_and_set_on(struct pi_desc *pi_desc)
+{
+ return test_and_set_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->control);
+}
+
+static inline bool pi_test_and_clear_on(struct pi_desc *pi_desc)
+{
+ return test_and_clear_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->control);
+}
+
+static inline bool pi_test_and_clear_sn(struct pi_desc *pi_desc)
+{
+ return test_and_clear_bit(POSTED_INTR_SN, (unsigned long *)&pi_desc->control);
+}
+
+static inline bool pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
+{
+ return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
+}
+
+static inline bool pi_is_pir_empty(struct pi_desc *pi_desc)
+{
+ return bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS);
+}
+
+static inline void pi_set_sn(struct pi_desc *pi_desc)
+{
+ set_bit(POSTED_INTR_SN, (unsigned long *)&pi_desc->control);
+}
+
+static inline void pi_set_on(struct pi_desc *pi_desc)
+{
+ set_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->control);
+}
+
+static inline void pi_clear_on(struct pi_desc *pi_desc)
+{
+ clear_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->control);
+}
+
+static inline void pi_clear_sn(struct pi_desc *pi_desc)
+{
+ clear_bit(POSTED_INTR_SN, (unsigned long *)&pi_desc->control);
+}
+
+static inline bool pi_test_on(struct pi_desc *pi_desc)
+{
+ return test_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->control);
+}
+
+static inline bool pi_test_sn(struct pi_desc *pi_desc)
+{
+ return test_bit(POSTED_INTR_SN, (unsigned long *)&pi_desc->control);
+}
+
+/* Non-atomic helpers */
+static inline void __pi_set_sn(struct pi_desc *pi_desc)
+{
+ pi_desc->notifications |= BIT(POSTED_INTR_SN);
+}
+
+static inline void __pi_clear_sn(struct pi_desc *pi_desc)
+{
+ pi_desc->notifications &= ~BIT(POSTED_INTR_SN);
+}
+
+#ifdef CONFIG_X86_POSTED_MSI
+/*
+ * Not all external vectors are subject to interrupt remapping, e.g. IOMMU's
+ * own interrupts. Here we do not distinguish them since those vector bits in
+ * PIR will always be zero.
+ */
+static inline bool pi_pending_this_cpu(unsigned int vector)
+{
+ struct pi_desc *pid = this_cpu_ptr(&posted_msi_pi_desc);
+
+ if (WARN_ON_ONCE(vector > NR_VECTORS || vector < FIRST_EXTERNAL_VECTOR))
+ return false;
+
+ return test_bit(vector, (unsigned long *)pid->pir);
+}
+
+extern void intel_posted_msi_init(void);
+#else
+static inline bool pi_pending_this_cpu(unsigned int vector) { return false; }
+
+static inline void intel_posted_msi_init(void) {};
+#endif /* X86_POSTED_MSI */
+
+#endif /* _X86_POSTED_INTR_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 78e51b0d6433..4a686f0e5dbf 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -108,9 +108,23 @@ struct cpuinfo_topology {
};
struct cpuinfo_x86 {
- __u8 x86; /* CPU family */
- __u8 x86_vendor; /* CPU vendor */
- __u8 x86_model;
+ union {
+ /*
+ * The particular ordering (low-to-high) of (vendor,
+ * family, model) is done in case range of models, like
+ * it is usually done on AMD, need to be compared.
+ */
+ struct {
+ __u8 x86_model;
+ /* CPU family */
+ __u8 x86;
+ /* CPU vendor */
+ __u8 x86_vendor;
+ __u8 x86_reserved;
+ };
+ /* combined vendor, family, model */
+ __u32 x86_vfm;
+ };
__u8 x86_stepping;
#ifdef CONFIG_X86_64
/* Number of 4K pages in DTLB/ITLB combined(in pages): */
@@ -568,7 +582,8 @@ extern void switch_gdt_and_percpu_base(int);
extern void load_direct_gdt(int);
extern void load_fixmap_gdt(int);
extern void cpu_init(void);
-extern void cpu_init_exception_handling(void);
+extern void cpu_init_exception_handling(bool boot_cpu);
+extern void cpu_init_replace_early_idt(void);
extern void cr4_init(void);
extern void set_task_blockstep(struct task_struct *task, bool on);
@@ -586,7 +601,7 @@ extern char ignore_fpu_irq;
# define BASE_PREFETCH ""
# define ARCH_HAS_PREFETCH
#else
-# define BASE_PREFETCH "prefetcht0 %P1"
+# define BASE_PREFETCH "prefetcht0 %1"
#endif
/*
@@ -597,7 +612,7 @@ extern char ignore_fpu_irq;
*/
static inline void prefetch(const void *x)
{
- alternative_input(BASE_PREFETCH, "prefetchnta %P1",
+ alternative_input(BASE_PREFETCH, "prefetchnta %1",
X86_FEATURE_XMM,
"m" (*(const char *)x));
}
@@ -609,7 +624,7 @@ static inline void prefetch(const void *x)
*/
static __always_inline void prefetchw(const void *x)
{
- alternative_input(BASE_PREFETCH, "prefetchw %P1",
+ alternative_input(BASE_PREFETCH, "prefetchw %1",
X86_FEATURE_3DNOWPREFETCH,
"m" (*(const char *)x));
}
@@ -635,12 +650,10 @@ static __always_inline void prefetchw(const void *x)
#define KSTK_ESP(task) (task_pt_regs(task)->sp)
#else
-extern unsigned long __end_init_task[];
+extern unsigned long __top_init_kernel_stack[];
#define INIT_THREAD { \
- .sp = (unsigned long)&__end_init_task - \
- TOP_OF_KERNEL_STACK_PADDING - \
- sizeof(struct pt_regs), \
+ .sp = (unsigned long)&__top_init_kernel_stack, \
}
extern unsigned long KSTK_ESP(struct task_struct *task);
@@ -679,11 +692,18 @@ static inline u32 per_cpu_l2c_id(unsigned int cpu)
}
#ifdef CONFIG_CPU_SUP_AMD
-extern u32 amd_get_highest_perf(void);
-extern void amd_clear_divider(void);
+/*
+ * Issue a DIV 0/1 insn to clear any division data from previous DIV
+ * operations.
+ */
+static __always_inline void amd_clear_divider(void)
+{
+ asm volatile(ALTERNATIVE("", "div %2\n\t", X86_BUG_DIV0)
+ :: "a" (0), "d" (0), "r" (1));
+}
+
extern void amd_check_microcode(void);
#else
-static inline u32 amd_get_highest_perf(void) { return 0; }
static inline void amd_clear_divider(void) { }
static inline void amd_check_microcode(void) { }
#endif
diff --git a/arch/x86/include/asm/prom.h b/arch/x86/include/asm/prom.h
index 043758a2e627..365798cb4408 100644
--- a/arch/x86/include/asm/prom.h
+++ b/arch/x86/include/asm/prom.h
@@ -23,19 +23,14 @@ extern int of_ioapic;
extern u64 initial_dtb;
extern void add_dtb(u64 data);
void x86_of_pci_init(void);
-void x86_dtb_parse_smp_config(void);
+void x86_flattree_get_config(void);
#else
static inline void add_dtb(u64 data) { }
static inline void x86_of_pci_init(void) { }
-static inline void x86_dtb_parse_smp_config(void) { }
+static inline void x86_flattree_get_config(void) { }
#define of_ioapic 0
#endif
-#ifdef CONFIG_OF_EARLY_FLATTREE
-void x86_flattree_get_config(void);
-#else
-static inline void x86_flattree_get_config(void) { }
-#endif
extern char cmd_line[COMMAND_LINE_SIZE];
#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index 0c92db84469d..6e4f8fae3ce9 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -5,6 +5,7 @@
#include <asm/clocksource.h>
#include <asm/pvclock-abi.h>
+struct timespec64;
/* some helper functions for xen and kvm pv clock sources */
u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src);
u64 pvclock_clocksource_read_nowd(struct pvclock_vcpu_time_info *src);
diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h
index cde8357bb226..68da67df304d 100644
--- a/arch/x86/include/asm/qspinlock.h
+++ b/arch/x86/include/asm/qspinlock.h
@@ -66,13 +66,15 @@ static inline bool vcpu_is_preempted(long cpu)
#ifdef CONFIG_PARAVIRT
/*
- * virt_spin_lock_key - enables (by default) the virt_spin_lock() hijack.
+ * virt_spin_lock_key - disables by default the virt_spin_lock() hijack.
*
- * Native (and PV wanting native due to vCPU pinning) should disable this key.
- * It is done in this backwards fashion to only have a single direction change,
- * which removes ordering between native_pv_spin_init() and HV setup.
+ * Native (and PV wanting native due to vCPU pinning) should keep this key
+ * disabled. Native does not touch the key.
+ *
+ * When in a guest then native_pv_lock_init() enables the key first and
+ * KVM/XEN might conditionally disable it later in the boot process again.
*/
-DECLARE_STATIC_KEY_TRUE(virt_spin_lock_key);
+DECLARE_STATIC_KEY_FALSE(virt_spin_lock_key);
/*
* Shortcut for the queued_spin_lock_slowpath() function that allows
@@ -85,6 +87,8 @@ DECLARE_STATIC_KEY_TRUE(virt_spin_lock_key);
#define virt_spin_lock virt_spin_lock
static inline bool virt_spin_lock(struct qspinlock *lock)
{
+ int val;
+
if (!static_branch_likely(&virt_spin_lock_key))
return false;
@@ -94,10 +98,13 @@ static inline bool virt_spin_lock(struct qspinlock *lock)
* horrible lock 'holder' preemption issues.
*/
- do {
- while (atomic_read(&lock->val) != 0)
- cpu_relax();
- } while (atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL) != 0);
+ __retry:
+ val = atomic_read(&lock->val);
+
+ if (val || !atomic_try_cmpxchg(&lock->val, &val, _Q_LOCKED_VAL)) {
+ cpu_relax();
+ goto __retry;
+ }
return true;
}
diff --git a/arch/x86/include/asm/qspinlock_paravirt.h b/arch/x86/include/asm/qspinlock_paravirt.h
index ef9697f20129..0a985784be9b 100644
--- a/arch/x86/include/asm/qspinlock_paravirt.h
+++ b/arch/x86/include/asm/qspinlock_paravirt.h
@@ -25,9 +25,9 @@ __PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath, ".spinlock.text");
*
* void __lockfunc __pv_queued_spin_unlock(struct qspinlock *lock)
* {
- * u8 lockval = cmpxchg(&lock->locked, _Q_LOCKED_VAL, 0);
+ * u8 lockval = _Q_LOCKED_VAL;
*
- * if (likely(lockval == _Q_LOCKED_VAL))
+ * if (try_cmpxchg(&lock->locked, &lockval, 0))
* return;
* pv_queued_spin_unlock_slowpath(lock, lockval);
* }
@@ -40,10 +40,9 @@ __PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath, ".spinlock.text");
#define PV_UNLOCK_ASM \
FRAME_BEGIN \
"push %rdx\n\t" \
- "mov $0x1,%eax\n\t" \
+ "mov $" __stringify(_Q_LOCKED_VAL) ",%eax\n\t" \
"xor %edx,%edx\n\t" \
LOCK_PREFIX "cmpxchg %dl,(%rdi)\n\t" \
- "cmp $0x1,%al\n\t" \
"jne .slowpath\n\t" \
"pop %rdx\n\t" \
FRAME_END \
diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h
index 6536873f8fc0..ecd58ea9a837 100644
--- a/arch/x86/include/asm/reboot.h
+++ b/arch/x86/include/asm/reboot.h
@@ -25,14 +25,16 @@ void __noreturn machine_real_restart(unsigned int type);
#define MRR_BIOS 0
#define MRR_APM 1
-#if IS_ENABLED(CONFIG_KVM_INTEL) || IS_ENABLED(CONFIG_KVM_AMD)
typedef void (cpu_emergency_virt_cb)(void);
+#if IS_ENABLED(CONFIG_KVM_X86)
void cpu_emergency_register_virt_callback(cpu_emergency_virt_cb *callback);
void cpu_emergency_unregister_virt_callback(cpu_emergency_virt_cb *callback);
void cpu_emergency_disable_virtualization(void);
#else
+static inline void cpu_emergency_register_virt_callback(cpu_emergency_virt_cb *callback) {}
+static inline void cpu_emergency_unregister_virt_callback(cpu_emergency_virt_cb *callback) {}
static inline void cpu_emergency_disable_virtualization(void) {}
-#endif /* CONFIG_KVM_INTEL || CONFIG_KVM_AMD */
+#endif /* CONFIG_KVM_X86 */
typedef void (*nmi_shootdown_cb)(int, struct pt_regs*);
void nmi_shootdown_cpus(nmi_shootdown_cb callback);
diff --git a/arch/x86/include/asm/resctrl.h b/arch/x86/include/asm/resctrl.h
index 12dbd2588ca7..8b1b6ce1e51b 100644
--- a/arch/x86/include/asm/resctrl.h
+++ b/arch/x86/include/asm/resctrl.h
@@ -156,12 +156,6 @@ static inline void resctrl_sched_in(struct task_struct *tsk)
__resctrl_sched_in(tsk);
}
-static inline u32 resctrl_arch_system_num_rmid_idx(void)
-{
- /* RMID are independent numbers for x86. num_rmid_idx == num_rmid */
- return boot_cpu_data.x86_cache_max_rmid + 1;
-}
-
static inline void resctrl_arch_rmid_idx_decode(u32 idx, u32 *closid, u32 *rmid)
{
*rmid = idx;
diff --git a/arch/x86/include/asm/runtime-const.h b/arch/x86/include/asm/runtime-const.h
new file mode 100644
index 000000000000..24e3a53ca255
--- /dev/null
+++ b/arch/x86/include/asm/runtime-const.h
@@ -0,0 +1,61 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_RUNTIME_CONST_H
+#define _ASM_RUNTIME_CONST_H
+
+#define runtime_const_ptr(sym) ({ \
+ typeof(sym) __ret; \
+ asm_inline("mov %1,%0\n1:\n" \
+ ".pushsection runtime_ptr_" #sym ",\"a\"\n\t" \
+ ".long 1b - %c2 - .\n\t" \
+ ".popsection" \
+ :"=r" (__ret) \
+ :"i" ((unsigned long)0x0123456789abcdefull), \
+ "i" (sizeof(long))); \
+ __ret; })
+
+// The 'typeof' will create at _least_ a 32-bit type, but
+// will happily also take a bigger type and the 'shrl' will
+// clear the upper bits
+#define runtime_const_shift_right_32(val, sym) ({ \
+ typeof(0u+(val)) __ret = (val); \
+ asm_inline("shrl $12,%k0\n1:\n" \
+ ".pushsection runtime_shift_" #sym ",\"a\"\n\t" \
+ ".long 1b - 1 - .\n\t" \
+ ".popsection" \
+ :"+r" (__ret)); \
+ __ret; })
+
+#define runtime_const_init(type, sym) do { \
+ extern s32 __start_runtime_##type##_##sym[]; \
+ extern s32 __stop_runtime_##type##_##sym[]; \
+ runtime_const_fixup(__runtime_fixup_##type, \
+ (unsigned long)(sym), \
+ __start_runtime_##type##_##sym, \
+ __stop_runtime_##type##_##sym); \
+} while (0)
+
+/*
+ * The text patching is trivial - you can only do this at init time,
+ * when the text section hasn't been marked RO, and before the text
+ * has ever been executed.
+ */
+static inline void __runtime_fixup_ptr(void *where, unsigned long val)
+{
+ *(unsigned long *)where = val;
+}
+
+static inline void __runtime_fixup_shift(void *where, unsigned long val)
+{
+ *(unsigned char *)where = val;
+}
+
+static inline void runtime_const_fixup(void (*fn)(void *, unsigned long),
+ unsigned long val, s32 *start, s32 *end)
+{
+ while (start < end) {
+ fn(*start + (void *)start, val);
+ start++;
+ }
+}
+
+#endif
diff --git a/arch/x86/include/asm/seccomp.h b/arch/x86/include/asm/seccomp.h
index fef16e398161..42bcd42d70d1 100644
--- a/arch/x86/include/asm/seccomp.h
+++ b/arch/x86/include/asm/seccomp.h
@@ -9,7 +9,7 @@
#endif
#ifdef CONFIG_COMPAT
-#include <asm/ia32_unistd.h>
+#include <asm/unistd_32_ia32.h>
#define __NR_seccomp_read_32 __NR_ia32_read
#define __NR_seccomp_write_32 __NR_ia32_write
#define __NR_seccomp_exit_32 __NR_ia32_exit
diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h
index 9aee31862b4a..4b2abce2e3e7 100644
--- a/arch/x86/include/asm/set_memory.h
+++ b/arch/x86/include/asm/set_memory.h
@@ -49,8 +49,11 @@ int set_memory_wb(unsigned long addr, int numpages);
int set_memory_np(unsigned long addr, int numpages);
int set_memory_p(unsigned long addr, int numpages);
int set_memory_4k(unsigned long addr, int numpages);
+
+bool set_memory_enc_stop_conversion(void);
int set_memory_encrypted(unsigned long addr, int numpages);
int set_memory_decrypted(unsigned long addr, int numpages);
+
int set_memory_np_noalias(unsigned long addr, int numpages);
int set_memory_nonglobal(unsigned long addr, int numpages);
int set_memory_global(unsigned long addr, int numpages);
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index e61e68d71cba..0667b2a88614 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -28,6 +28,8 @@
#define NEW_CL_POINTER 0x228 /* Relative to real mode data */
#ifndef __ASSEMBLY__
+#include <linux/cache.h>
+
#include <asm/bootparam.h>
#include <asm/x86_init.h>
@@ -133,6 +135,12 @@ asmlinkage void __init __noreturn x86_64_start_reservations(char *real_mode_data
#endif /* __i386__ */
#endif /* _SETUP */
+#ifdef CONFIG_CMDLINE_BOOL
+extern bool builtin_cmdline_added __ro_after_init;
+#else
+#define builtin_cmdline_added 0
+#endif
+
#else /* __ASSEMBLY */
.macro __RESERVE_BRK name, size
diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h
index b463fcbd4b90..98726c2b04f8 100644
--- a/arch/x86/include/asm/sev-common.h
+++ b/arch/x86/include/asm/sev-common.h
@@ -54,8 +54,18 @@
(((unsigned long)fn) << 32))
/* AP Reset Hold */
-#define GHCB_MSR_AP_RESET_HOLD_REQ 0x006
-#define GHCB_MSR_AP_RESET_HOLD_RESP 0x007
+#define GHCB_MSR_AP_RESET_HOLD_REQ 0x006
+#define GHCB_MSR_AP_RESET_HOLD_RESP 0x007
+#define GHCB_MSR_AP_RESET_HOLD_RESULT_POS 12
+#define GHCB_MSR_AP_RESET_HOLD_RESULT_MASK GENMASK_ULL(51, 0)
+
+/* Preferred GHCB GPA Request */
+#define GHCB_MSR_PREF_GPA_REQ 0x010
+#define GHCB_MSR_GPA_VALUE_POS 12
+#define GHCB_MSR_GPA_VALUE_MASK GENMASK_ULL(51, 0)
+
+#define GHCB_MSR_PREF_GPA_RESP 0x011
+#define GHCB_MSR_PREF_GPA_NONE 0xfffffffffffff
/* GHCB GPA Register */
#define GHCB_MSR_REG_GPA_REQ 0x012
@@ -91,28 +101,61 @@ enum psc_op {
/* GHCBData[11:0] */ \
GHCB_MSR_PSC_REQ)
+#define GHCB_MSR_PSC_REQ_TO_GFN(msr) (((msr) & GENMASK_ULL(51, 12)) >> 12)
+#define GHCB_MSR_PSC_REQ_TO_OP(msr) (((msr) & GENMASK_ULL(55, 52)) >> 52)
+
#define GHCB_MSR_PSC_RESP 0x015
#define GHCB_MSR_PSC_RESP_VAL(val) \
/* GHCBData[63:32] */ \
(((u64)(val) & GENMASK_ULL(63, 32)) >> 32)
+/* Set highest bit as a generic error response */
+#define GHCB_MSR_PSC_RESP_ERROR (BIT_ULL(63) | GHCB_MSR_PSC_RESP)
+
+/* GHCB Run at VMPL Request/Response */
+#define GHCB_MSR_VMPL_REQ 0x016
+#define GHCB_MSR_VMPL_REQ_LEVEL(v) \
+ /* GHCBData[39:32] */ \
+ (((u64)(v) & GENMASK_ULL(7, 0) << 32) | \
+ /* GHCBDdata[11:0] */ \
+ GHCB_MSR_VMPL_REQ)
+
+#define GHCB_MSR_VMPL_RESP 0x017
+#define GHCB_MSR_VMPL_RESP_VAL(v) \
+ /* GHCBData[63:32] */ \
+ (((u64)(v) & GENMASK_ULL(63, 32)) >> 32)
+
/* GHCB Hypervisor Feature Request/Response */
#define GHCB_MSR_HV_FT_REQ 0x080
#define GHCB_MSR_HV_FT_RESP 0x081
+#define GHCB_MSR_HV_FT_POS 12
+#define GHCB_MSR_HV_FT_MASK GENMASK_ULL(51, 0)
#define GHCB_MSR_HV_FT_RESP_VAL(v) \
/* GHCBData[63:12] */ \
(((u64)(v) & GENMASK_ULL(63, 12)) >> 12)
#define GHCB_HV_FT_SNP BIT_ULL(0)
#define GHCB_HV_FT_SNP_AP_CREATION BIT_ULL(1)
+#define GHCB_HV_FT_SNP_MULTI_VMPL BIT_ULL(5)
/*
* SNP Page State Change NAE event
* The VMGEXIT_PSC_MAX_ENTRY determines the size of the PSC structure, which
* is a local stack variable in set_pages_state(). Do not increase this value
* without evaluating the impact to stack usage.
+ *
+ * Use VMGEXIT_PSC_MAX_COUNT in cases where the actual GHCB-defined max value
+ * is needed, such as when processing GHCB requests on the hypervisor side.
*/
#define VMGEXIT_PSC_MAX_ENTRY 64
+#define VMGEXIT_PSC_MAX_COUNT 253
+
+#define VMGEXIT_PSC_ERROR_GENERIC (0x100UL << 32)
+#define VMGEXIT_PSC_ERROR_INVALID_HDR ((1UL << 32) | 1)
+#define VMGEXIT_PSC_ERROR_INVALID_ENTRY ((1UL << 32) | 2)
+
+#define VMGEXIT_PSC_OP_PRIVATE 1
+#define VMGEXIT_PSC_OP_SHARED 2
struct psc_hdr {
u16 cur_entry;
@@ -159,6 +202,10 @@ struct snp_psc_desc {
#define GHCB_TERM_NOT_VMPL0 3 /* SNP guest is not running at VMPL-0 */
#define GHCB_TERM_CPUID 4 /* CPUID-validation failure */
#define GHCB_TERM_CPUID_HV 5 /* CPUID failure during hypervisor fallback */
+#define GHCB_TERM_SECRETS_PAGE 6 /* Secrets page failure */
+#define GHCB_TERM_NO_SVSM 7 /* SVSM is not advertised in the secrets page */
+#define GHCB_TERM_SVSM_VMPL0 8 /* SVSM is present but has set VMPL to 0 */
+#define GHCB_TERM_SVSM_CAA 9 /* SVSM is present but CAA is not page aligned */
#define GHCB_RESP_CODE(v) ((v) & GHCB_MSR_INFO_MASK)
diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h
index 93ed60080cfe..ee34ab00a8d6 100644
--- a/arch/x86/include/asm/sev.h
+++ b/arch/x86/include/asm/sev.h
@@ -91,6 +91,9 @@ extern bool handle_vc_boot_ghcb(struct pt_regs *regs);
/* RMUPDATE detected 4K page and 2MB page overlap. */
#define RMPUPDATE_FAIL_OVERLAP 4
+/* PSMASH failed due to concurrent access by another CPU */
+#define PSMASH_FAIL_INUSE 3
+
/* RMP page size */
#define RMP_PG_SIZE_4K 0
#define RMP_PG_SIZE_2M 1
@@ -116,6 +119,54 @@ struct snp_req_data {
unsigned int data_npages;
};
+#define MAX_AUTHTAG_LEN 32
+
+/* See SNP spec SNP_GUEST_REQUEST section for the structure */
+enum msg_type {
+ SNP_MSG_TYPE_INVALID = 0,
+ SNP_MSG_CPUID_REQ,
+ SNP_MSG_CPUID_RSP,
+ SNP_MSG_KEY_REQ,
+ SNP_MSG_KEY_RSP,
+ SNP_MSG_REPORT_REQ,
+ SNP_MSG_REPORT_RSP,
+ SNP_MSG_EXPORT_REQ,
+ SNP_MSG_EXPORT_RSP,
+ SNP_MSG_IMPORT_REQ,
+ SNP_MSG_IMPORT_RSP,
+ SNP_MSG_ABSORB_REQ,
+ SNP_MSG_ABSORB_RSP,
+ SNP_MSG_VMRK_REQ,
+ SNP_MSG_VMRK_RSP,
+
+ SNP_MSG_TYPE_MAX
+};
+
+enum aead_algo {
+ SNP_AEAD_INVALID,
+ SNP_AEAD_AES_256_GCM,
+};
+
+struct snp_guest_msg_hdr {
+ u8 authtag[MAX_AUTHTAG_LEN];
+ u64 msg_seqno;
+ u8 rsvd1[8];
+ u8 algo;
+ u8 hdr_version;
+ u16 hdr_sz;
+ u8 msg_type;
+ u8 msg_version;
+ u16 msg_sz;
+ u32 rsvd2;
+ u8 msg_vmpck;
+ u8 rsvd3[35];
+} __packed;
+
+struct snp_guest_msg {
+ struct snp_guest_msg_hdr hdr;
+ u8 payload[PAGE_SIZE - sizeof(struct snp_guest_msg_hdr)];
+} __packed;
+
struct sev_guest_platform_data {
u64 secrets_gpa;
};
@@ -140,7 +191,7 @@ struct secrets_os_area {
#define VMPCK_KEY_LEN 32
/* See the SNP spec version 0.9 for secrets page format */
-struct snp_secrets_page_layout {
+struct snp_secrets_page {
u32 version;
u32 imien : 1,
rsvd1 : 31;
@@ -152,10 +203,119 @@ struct snp_secrets_page_layout {
u8 vmpck2[VMPCK_KEY_LEN];
u8 vmpck3[VMPCK_KEY_LEN];
struct secrets_os_area os_area;
- u8 rsvd3[3840];
+
+ u8 vmsa_tweak_bitmap[64];
+
+ /* SVSM fields */
+ u64 svsm_base;
+ u64 svsm_size;
+ u64 svsm_caa;
+ u32 svsm_max_version;
+ u8 svsm_guest_vmpl;
+ u8 rsvd3[3];
+
+ /* Remainder of page */
+ u8 rsvd4[3744];
} __packed;
+/*
+ * The SVSM Calling Area (CA) related structures.
+ */
+struct svsm_ca {
+ u8 call_pending;
+ u8 mem_available;
+ u8 rsvd1[6];
+
+ u8 svsm_buffer[PAGE_SIZE - 8];
+};
+
+#define SVSM_SUCCESS 0
+#define SVSM_ERR_INCOMPLETE 0x80000000
+#define SVSM_ERR_UNSUPPORTED_PROTOCOL 0x80000001
+#define SVSM_ERR_UNSUPPORTED_CALL 0x80000002
+#define SVSM_ERR_INVALID_ADDRESS 0x80000003
+#define SVSM_ERR_INVALID_FORMAT 0x80000004
+#define SVSM_ERR_INVALID_PARAMETER 0x80000005
+#define SVSM_ERR_INVALID_REQUEST 0x80000006
+#define SVSM_ERR_BUSY 0x80000007
+#define SVSM_PVALIDATE_FAIL_SIZEMISMATCH 0x80001006
+
+/*
+ * The SVSM PVALIDATE related structures
+ */
+struct svsm_pvalidate_entry {
+ u64 page_size : 2,
+ action : 1,
+ ignore_cf : 1,
+ rsvd : 8,
+ pfn : 52;
+};
+
+struct svsm_pvalidate_call {
+ u16 num_entries;
+ u16 cur_index;
+
+ u8 rsvd1[4];
+
+ struct svsm_pvalidate_entry entry[];
+};
+
+#define SVSM_PVALIDATE_MAX_COUNT ((sizeof_field(struct svsm_ca, svsm_buffer) - \
+ offsetof(struct svsm_pvalidate_call, entry)) / \
+ sizeof(struct svsm_pvalidate_entry))
+
+/*
+ * The SVSM Attestation related structures
+ */
+struct svsm_loc_entry {
+ u64 pa;
+ u32 len;
+ u8 rsvd[4];
+};
+
+struct svsm_attest_call {
+ struct svsm_loc_entry report_buf;
+ struct svsm_loc_entry nonce;
+ struct svsm_loc_entry manifest_buf;
+ struct svsm_loc_entry certificates_buf;
+
+ /* For attesting a single service */
+ u8 service_guid[16];
+ u32 service_manifest_ver;
+ u8 rsvd[4];
+};
+
+/*
+ * SVSM protocol structure
+ */
+struct svsm_call {
+ struct svsm_ca *caa;
+ u64 rax;
+ u64 rcx;
+ u64 rdx;
+ u64 r8;
+ u64 r9;
+ u64 rax_out;
+ u64 rcx_out;
+ u64 rdx_out;
+ u64 r8_out;
+ u64 r9_out;
+};
+
+#define SVSM_CORE_CALL(x) ((0ULL << 32) | (x))
+#define SVSM_CORE_REMAP_CA 0
+#define SVSM_CORE_PVALIDATE 1
+#define SVSM_CORE_CREATE_VCPU 2
+#define SVSM_CORE_DELETE_VCPU 3
+
+#define SVSM_ATTEST_CALL(x) ((1ULL << 32) | (x))
+#define SVSM_ATTEST_SERVICES 0
+#define SVSM_ATTEST_SINGLE_SERVICE 1
+
#ifdef CONFIG_AMD_MEM_ENCRYPT
+
+extern u8 snp_vmpl;
+
extern void __sev_es_ist_enter(struct pt_regs *regs);
extern void __sev_es_ist_exit(void);
static __always_inline void sev_es_ist_enter(struct pt_regs *regs)
@@ -181,6 +341,14 @@ static __always_inline void sev_es_nmi_complete(void)
extern int __init sev_es_efi_map_ghcbs(pgd_t *pgd);
extern void sev_enable(struct boot_params *bp);
+/*
+ * RMPADJUST modifies the RMP permissions of a page of a lesser-
+ * privileged (numerically higher) VMPL.
+ *
+ * If the guest is running at a higher-privilege than the privilege
+ * level the instruction is targeting, the instruction will succeed,
+ * otherwise, it will fail.
+ */
static inline int rmpadjust(unsigned long vaddr, bool rmp_psize, unsigned long attrs)
{
int rc;
@@ -225,11 +393,16 @@ bool snp_init(struct boot_params *bp);
void __noreturn snp_abort(void);
void snp_dmi_setup(void);
int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, struct snp_guest_request_ioctl *rio);
+int snp_issue_svsm_attest_req(u64 call_id, struct svsm_call *call, struct svsm_attest_call *input);
void snp_accept_memory(phys_addr_t start, phys_addr_t end);
u64 snp_get_unsupported_features(u64 status);
u64 sev_get_status(void);
void sev_show_status(void);
-#else
+void snp_update_svsm_ca(void);
+
+#else /* !CONFIG_AMD_MEM_ENCRYPT */
+
+#define snp_vmpl 0
static inline void sev_es_ist_enter(struct pt_regs *regs) { }
static inline void sev_es_ist_exit(void) { }
static inline int sev_es_setup_ap_jump_table(struct real_mode_header *rmh) { return 0; }
@@ -253,12 +426,17 @@ static inline int snp_issue_guest_request(u64 exit_code, struct snp_req_data *in
{
return -ENOTTY;
}
-
+static inline int snp_issue_svsm_attest_req(u64 call_id, struct svsm_call *call, struct svsm_attest_call *input)
+{
+ return -ENOTTY;
+}
static inline void snp_accept_memory(phys_addr_t start, phys_addr_t end) { }
static inline u64 snp_get_unsupported_features(u64 status) { return 0; }
static inline u64 sev_get_status(void) { return 0; }
static inline void sev_show_status(void) { }
-#endif
+static inline void snp_update_svsm_ca(void) { }
+
+#endif /* CONFIG_AMD_MEM_ENCRYPT */
#ifdef CONFIG_KVM_AMD_SEV
bool snp_probe_rmptable_info(void);
diff --git a/arch/x86/include/asm/shstk.h b/arch/x86/include/asm/shstk.h
index 42fee8959df7..4cb77e004615 100644
--- a/arch/x86/include/asm/shstk.h
+++ b/arch/x86/include/asm/shstk.h
@@ -21,6 +21,8 @@ unsigned long shstk_alloc_thread_stack(struct task_struct *p, unsigned long clon
void shstk_free(struct task_struct *p);
int setup_signal_shadow_stack(struct ksignal *ksig);
int restore_signal_shadow_stack(void);
+int shstk_update_last_frame(unsigned long val);
+bool shstk_is_enabled(void);
#else
static inline long shstk_prctl(struct task_struct *task, int option,
unsigned long arg2) { return -EINVAL; }
@@ -31,6 +33,8 @@ static inline unsigned long shstk_alloc_thread_stack(struct task_struct *p,
static inline void shstk_free(struct task_struct *p) {}
static inline int setup_signal_shadow_stack(struct ksignal *ksig) { return 0; }
static inline int restore_signal_shadow_stack(void) { return 0; }
+static inline int shstk_update_last_frame(unsigned long val) { return 0; }
+static inline bool shstk_is_enabled(void) { return false; }
#endif /* CONFIG_X86_USER_SHADOW_STACK */
#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index a35936b512fe..ca073f40698f 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -35,6 +35,7 @@ struct smp_ops {
int (*cpu_disable)(void);
void (*cpu_die)(unsigned int cpu);
void (*play_dead)(void);
+ void (*stop_this_cpu)(void);
void (*send_call_func_ipi)(const struct cpumask *mask);
void (*send_call_func_single_ipi)(int cpu);
diff --git a/arch/x86/include/asm/sparsemem.h b/arch/x86/include/asm/sparsemem.h
index 1be13b2dfe8b..3918c7a434f5 100644
--- a/arch/x86/include/asm/sparsemem.h
+++ b/arch/x86/include/asm/sparsemem.h
@@ -31,15 +31,4 @@
#endif /* CONFIG_SPARSEMEM */
-#ifndef __ASSEMBLY__
-#ifdef CONFIG_NUMA_KEEP_MEMINFO
-extern int phys_to_target_node(phys_addr_t start);
-#define phys_to_target_node phys_to_target_node
-extern int memory_add_physaddr_to_nid(u64 start);
-#define memory_add_physaddr_to_nid memory_add_physaddr_to_nid
-extern int numa_fill_memblks(u64 start, u64 end);
-#define numa_fill_memblks numa_fill_memblks
-#endif
-#endif /* __ASSEMBLY__ */
-
#endif /* _ASM_X86_SPARSEMEM_H */
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index 2e9fc5c400cd..aec6e2d3aa1d 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -182,8 +182,8 @@ static __always_inline void clflush(volatile void *__p)
static inline void clflushopt(volatile void *__p)
{
- alternative_io(".byte 0x3e; clflush %P0",
- ".byte 0x66; clflush %P0",
+ alternative_io(".byte 0x3e; clflush %0",
+ ".byte 0x66; clflush %0",
X86_FEATURE_CLFLUSHOPT,
"+m" (*(volatile char __force *)__p));
}
@@ -205,9 +205,9 @@ static inline void clwb(volatile void *__p)
#ifdef CONFIG_X86_USER_SHADOW_STACK
static inline int write_user_shstk_64(u64 __user *addr, u64 val)
{
- asm goto("1: wrussq %[val], (%[addr])\n"
+ asm goto("1: wrussq %[val], %[addr]\n"
_ASM_EXTABLE(1b, %l[fail])
- :: [addr] "r" (addr), [val] "r" (val)
+ :: [addr] "m" (*addr), [val] "r" (val)
:: fail);
return 0;
fail:
diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index 857d364b9888..9d0b324eab21 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -30,37 +30,40 @@ void *__memset(void *s, int c, size_t n);
#define __HAVE_ARCH_MEMSET16
static inline void *memset16(uint16_t *s, uint16_t v, size_t n)
{
- long d0, d1;
- asm volatile("rep\n\t"
- "stosw"
- : "=&c" (d0), "=&D" (d1)
- : "a" (v), "1" (s), "0" (n)
- : "memory");
- return s;
+ const __auto_type s0 = s;
+ asm volatile (
+ "rep stosw"
+ : "+D" (s), "+c" (n)
+ : "a" (v)
+ : "memory"
+ );
+ return s0;
}
#define __HAVE_ARCH_MEMSET32
static inline void *memset32(uint32_t *s, uint32_t v, size_t n)
{
- long d0, d1;
- asm volatile("rep\n\t"
- "stosl"
- : "=&c" (d0), "=&D" (d1)
- : "a" (v), "1" (s), "0" (n)
- : "memory");
- return s;
+ const __auto_type s0 = s;
+ asm volatile (
+ "rep stosl"
+ : "+D" (s), "+c" (n)
+ : "a" (v)
+ : "memory"
+ );
+ return s0;
}
#define __HAVE_ARCH_MEMSET64
static inline void *memset64(uint64_t *s, uint64_t v, size_t n)
{
- long d0, d1;
- asm volatile("rep\n\t"
- "stosq"
- : "=&c" (d0), "=&D" (d1)
- : "a" (v), "1" (s), "0" (n)
- : "memory");
- return s;
+ const __auto_type s0 = s;
+ asm volatile (
+ "rep stosq"
+ : "+D" (s), "+c" (n)
+ : "a" (v)
+ : "memory"
+ );
+ return s0;
}
#endif
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 728c98175b9c..2b59b9951c90 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -285,7 +285,14 @@ static_assert((X2AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == X2AVIC_
#define AVIC_HPA_MASK ~((0xFFFULL << 52) | 0xFFF)
-#define SVM_SEV_FEAT_DEBUG_SWAP BIT(5)
+#define SVM_SEV_FEAT_SNP_ACTIVE BIT(0)
+#define SVM_SEV_FEAT_RESTRICTED_INJECTION BIT(3)
+#define SVM_SEV_FEAT_ALTERNATE_INJECTION BIT(4)
+#define SVM_SEV_FEAT_DEBUG_SWAP BIT(5)
+
+#define SVM_SEV_FEAT_INT_INJ_MODES \
+ (SVM_SEV_FEAT_RESTRICTED_INJECTION | \
+ SVM_SEV_FEAT_ALTERNATE_INJECTION)
struct vmcb_seg {
u16 selector;
@@ -509,6 +516,20 @@ struct ghcb {
u32 ghcb_usage;
} __packed;
+struct vmcb {
+ struct vmcb_control_area control;
+ union {
+ struct vmcb_save_area save;
+
+ /*
+ * For SEV-ES VMs, the save area in the VMCB is used only to
+ * save/load host state. Guest state resides in a separate
+ * page, the aptly named VM Save Area (VMSA), that is encrypted
+ * with the guest's private key.
+ */
+ struct sev_es_save_area host_sev_es_save;
+ };
+} __packed;
#define EXPECTED_VMCB_SAVE_AREA_SIZE 744
#define EXPECTED_GHCB_SAVE_AREA_SIZE 1032
@@ -525,6 +546,7 @@ static inline void __unused_size_checks(void)
BUILD_BUG_ON(sizeof(struct ghcb_save_area) != EXPECTED_GHCB_SAVE_AREA_SIZE);
BUILD_BUG_ON(sizeof(struct sev_es_save_area) != EXPECTED_SEV_ES_SAVE_AREA_SIZE);
BUILD_BUG_ON(sizeof(struct vmcb_control_area) != EXPECTED_VMCB_CONTROL_AREA_SIZE);
+ BUILD_BUG_ON(offsetof(struct vmcb, save) != EXPECTED_VMCB_CONTROL_AREA_SIZE);
BUILD_BUG_ON(sizeof(struct ghcb) != EXPECTED_GHCB_SIZE);
/* Check offsets of reserved fields */
@@ -561,11 +583,6 @@ static inline void __unused_size_checks(void)
BUILD_BUG_RESERVED_OFFSET(ghcb, 0xff0);
}
-struct vmcb {
- struct vmcb_control_area control;
- struct vmcb_save_area save;
-} __packed;
-
#define SVM_CPUID_FUNC 0x8000000a
#define SVM_SELECTOR_S_SHIFT 4
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index c3bd0c0758c9..75248546403d 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -70,13 +70,9 @@ static inline void update_task_stack(struct task_struct *task)
#ifdef CONFIG_X86_32
this_cpu_write(cpu_tss_rw.x86_tss.sp1, task->thread.sp0);
#else
- if (cpu_feature_enabled(X86_FEATURE_FRED)) {
- /* WRMSRNS is a baseline feature for FRED. */
- wrmsrns(MSR_IA32_FRED_RSP0, (unsigned long)task_stack_page(task) + THREAD_SIZE);
- } else if (cpu_feature_enabled(X86_FEATURE_XENPV)) {
+ if (!cpu_feature_enabled(X86_FEATURE_FRED) && cpu_feature_enabled(X86_FEATURE_XENPV))
/* Xen PV enters the kernel on the thread stack. */
load_sp0(task_top_of_stack(task));
- }
#endif
}
diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h
index 2fc7bc3863ff..7c488ff0c764 100644
--- a/arch/x86/include/asm/syscall.h
+++ b/arch/x86/include/asm/syscall.h
@@ -82,7 +82,12 @@ static inline void syscall_get_arguments(struct task_struct *task,
struct pt_regs *regs,
unsigned long *args)
{
- memcpy(args, &regs->bx, 6 * sizeof(args[0]));
+ args[0] = regs->bx;
+ args[1] = regs->cx;
+ args[2] = regs->dx;
+ args[3] = regs->si;
+ args[4] = regs->di;
+ args[5] = regs->bp;
}
static inline int syscall_get_arch(struct task_struct *task)
diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h
index 345aafbc1964..6259f1937fe7 100644
--- a/arch/x86/include/asm/text-patching.h
+++ b/arch/x86/include/asm/text-patching.h
@@ -15,7 +15,7 @@
extern void text_poke_early(void *addr, const void *opcode, size_t len);
-extern void apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len);
+extern void apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len);
/*
* Clear and restore the kernel write-protection flag on the local CPU.
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 25726893c6f4..69e79fff41b8 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -399,11 +399,10 @@ static inline u64 tlbstate_lam_cr3_mask(void)
return lam << X86_CR3_LAM_U57_BIT;
}
-static inline void set_tlbstate_lam_mode(struct mm_struct *mm)
+static inline void cpu_tlbstate_update_lam(unsigned long lam, u64 untag_mask)
{
- this_cpu_write(cpu_tlbstate.lam,
- mm->context.lam_cr3_mask >> X86_CR3_LAM_U57_BIT);
- this_cpu_write(tlbstate_untag_mask, mm->context.untag_mask);
+ this_cpu_write(cpu_tlbstate.lam, lam >> X86_CR3_LAM_U57_BIT);
+ this_cpu_write(tlbstate_untag_mask, untag_mask);
}
#else
@@ -413,7 +412,7 @@ static inline u64 tlbstate_lam_cr3_mask(void)
return 0;
}
-static inline void set_tlbstate_lam_mode(struct mm_struct *mm)
+static inline void cpu_tlbstate_update_lam(unsigned long lam, u64 untag_mask)
{
}
#endif
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index abe3a8f22cbd..aef70336d624 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -282,9 +282,22 @@ static inline long arch_scale_freq_capacity(int cpu)
}
#define arch_scale_freq_capacity arch_scale_freq_capacity
+bool arch_enable_hybrid_capacity_scale(void);
+void arch_set_cpu_capacity(int cpu, unsigned long cap, unsigned long max_cap,
+ unsigned long cap_freq, unsigned long base_freq);
+
+unsigned long arch_scale_cpu_capacity(int cpu);
+#define arch_scale_cpu_capacity arch_scale_cpu_capacity
+
extern void arch_set_max_freq_ratio(bool turbo_disabled);
extern void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled);
#else
+static inline bool arch_enable_hybrid_capacity_scale(void) { return false; }
+static inline void arch_set_cpu_capacity(int cpu, unsigned long cap,
+ unsigned long max_cap,
+ unsigned long cap_freq,
+ unsigned long base_freq) { }
+
static inline void arch_set_max_freq_ratio(bool turbo_disabled) { }
static inline void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled) { }
#endif
diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index 405efb3e4996..94408a784c8e 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -28,9 +28,6 @@ static inline cycles_t get_cycles(void)
}
#define get_cycles get_cycles
-extern struct system_counterval_t convert_art_to_tsc(u64 art);
-extern struct system_counterval_t convert_art_ns_to_tsc(u64 art_ns);
-
extern void tsc_early_init(void);
extern void tsc_init(void);
extern void mark_tsc_unstable(char *reason);
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 237dc8cdd12b..3a7755c1a441 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -78,10 +78,10 @@ extern int __get_user_bad(void);
int __ret_gu; \
register __inttype(*(ptr)) __val_gu asm("%"_ASM_DX); \
__chk_user_ptr(ptr); \
- asm volatile("call __" #fn "_%P4" \
+ asm volatile("call __" #fn "_%c[size]" \
: "=a" (__ret_gu), "=r" (__val_gu), \
ASM_CALL_CONSTRAINT \
- : "0" (ptr), "i" (sizeof(*(ptr)))); \
+ : "0" (ptr), [size] "i" (sizeof(*(ptr)))); \
instrument_get_user(__val_gu); \
(x) = (__force __typeof__(*(ptr))) __val_gu; \
__builtin_expect(__ret_gu, 0); \
@@ -177,7 +177,7 @@ extern void __put_user_nocheck_8(void);
__chk_user_ptr(__ptr); \
__ptr_pu = __ptr; \
__val_pu = __x; \
- asm volatile("call __" #fn "_%P[size]" \
+ asm volatile("call __" #fn "_%c[size]" \
: "=c" (__ret_pu), \
ASM_CALL_CONSTRAINT \
: "0" (__ptr_pu), \
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index 04789f45ab2b..afce8ee5d7b7 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -54,6 +54,17 @@ static inline unsigned long __untagged_addr_remote(struct mm_struct *mm,
#define valid_user_address(x) ((__force long)(x) >= 0)
/*
+ * Masking the user address is an alternative to a conditional
+ * user_access_begin that can avoid the fencing. This only works
+ * for dense accesses starting at the address.
+ */
+#define mask_user_address(x) ((typeof(x))((long)(x)|((long)(x)>>63)))
+#define masked_user_access_begin(x) ({ \
+ __auto_type __masked_ptr = (x); \
+ __masked_ptr = mask_user_address(__masked_ptr); \
+ __uaccess_begin(); __masked_ptr; })
+
+/*
* User pointers can have tag bits on x86-64. This scheme tolerates
* arbitrary values in those bits rather then masking them off.
*
diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h
index 761173ccc33c..6c9e5bdd3916 100644
--- a/arch/x86/include/asm/unistd.h
+++ b/arch/x86/include/asm/unistd.h
@@ -56,6 +56,5 @@
# define __ARCH_WANT_SYS_FORK
# define __ARCH_WANT_SYS_VFORK
# define __ARCH_WANT_SYS_CLONE
-# define __ARCH_WANT_SYS_CLONE3
#endif /* _ASM_X86_UNISTD_H */
diff --git a/arch/x86/include/asm/uv/uv_irq.h b/arch/x86/include/asm/uv/uv_irq.h
index d6b17c760622..1876b5edd142 100644
--- a/arch/x86/include/asm/uv/uv_irq.h
+++ b/arch/x86/include/asm/uv/uv_irq.h
@@ -31,7 +31,6 @@ enum {
UV_AFFINITY_CPU
};
-extern int uv_irq_2_mmr_info(int, unsigned long *, int *);
extern int uv_setup_irq(char *, int, int, unsigned long, int);
extern void uv_teardown_irq(unsigned int);
diff --git a/arch/x86/include/asm/vdso/getrandom.h b/arch/x86/include/asm/vdso/getrandom.h
new file mode 100644
index 000000000000..ff5334ad32a0
--- /dev/null
+++ b/arch/x86/include/asm/vdso/getrandom.h
@@ -0,0 +1,42 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2022-2024 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+#ifndef __ASM_VDSO_GETRANDOM_H
+#define __ASM_VDSO_GETRANDOM_H
+
+#ifndef __ASSEMBLY__
+
+#include <asm/unistd.h>
+#include <asm/vvar.h>
+
+/**
+ * getrandom_syscall - Invoke the getrandom() syscall.
+ * @buffer: Destination buffer to fill with random bytes.
+ * @len: Size of @buffer in bytes.
+ * @flags: Zero or more GRND_* flags.
+ * Returns: The number of random bytes written to @buffer, or a negative value indicating an error.
+ */
+static __always_inline ssize_t getrandom_syscall(void *buffer, size_t len, unsigned int flags)
+{
+ long ret;
+
+ asm ("syscall" : "=a" (ret) :
+ "0" (__NR_getrandom), "D" (buffer), "S" (len), "d" (flags) :
+ "rcx", "r11", "memory");
+
+ return ret;
+}
+
+#define __vdso_rng_data (VVAR(_vdso_rng_data))
+
+static __always_inline const struct vdso_rng_data *__arch_get_vdso_rng_data(void)
+{
+ if (IS_ENABLED(CONFIG_TIME_NS) && __vdso_data->clock_mode == VDSO_CLOCKMODE_TIMENS)
+ return (void *)&__vdso_rng_data + ((void *)&__timens_vdso_data - (void *)&__vdso_data);
+ return &__vdso_rng_data;
+}
+
+#endif /* !__ASSEMBLY__ */
+
+#endif /* __ASM_VDSO_GETRANDOM_H */
diff --git a/arch/x86/include/asm/vdso/gettimeofday.h b/arch/x86/include/asm/vdso/gettimeofday.h
index 8e048ca980df..b2d2df026f6e 100644
--- a/arch/x86/include/asm/vdso/gettimeofday.h
+++ b/arch/x86/include/asm/vdso/gettimeofday.h
@@ -300,7 +300,7 @@ static inline bool arch_vdso_cycles_ok(u64 cycles)
#define vdso_cycles_ok arch_vdso_cycles_ok
/*
- * x86 specific delta calculation.
+ * x86 specific calculation of nanoseconds for the current cycle count
*
* The regular implementation assumes that clocksource reads are globally
* monotonic. The TSC can be slightly off across sockets which can cause
@@ -308,8 +308,8 @@ static inline bool arch_vdso_cycles_ok(u64 cycles)
* jump.
*
* Therefore it needs to be verified that @cycles are greater than
- * @last. If not then use @last, which is the base time of the current
- * conversion period.
+ * @vd->cycles_last. If not then use @vd->cycles_last, which is the base
+ * time of the current conversion period.
*
* This variant also uses a custom mask because while the clocksource mask of
* all the VDSO capable clocksources on x86 is U64_MAX, the above code uses
@@ -317,25 +317,36 @@ static inline bool arch_vdso_cycles_ok(u64 cycles)
* declares everything with the MSB/Sign-bit set as invalid. Therefore the
* effective mask is S64_MAX.
*/
-static __always_inline
-u64 vdso_calc_delta(u64 cycles, u64 last, u64 mask, u32 mult)
+static __always_inline u64 vdso_calc_ns(const struct vdso_data *vd, u64 cycles, u64 base)
{
- /*
- * Due to the MSB/Sign-bit being used as invalid marker (see
- * arch_vdso_cycles_valid() above), the effective mask is S64_MAX.
- */
- u64 delta = (cycles - last) & S64_MAX;
+ u64 delta = cycles - vd->cycle_last;
/*
- * Due to the above mentioned TSC wobbles, filter out negative motion.
- * Per the above masking, the effective sign bit is now bit 62.
+ * Negative motion and deltas which can cause multiplication
+ * overflow require special treatment. This check covers both as
+ * negative motion is guaranteed to be greater than @vd::max_cycles
+ * due to unsigned comparison.
+ *
+ * Due to the MSB/Sign-bit being used as invalid marker (see
+ * arch_vdso_cycles_ok() above), the effective mask is S64_MAX, but that
+ * case is also unlikely and will also take the unlikely path here.
*/
- if (unlikely(delta & (1ULL << 62)))
- return 0;
+ if (unlikely(delta > vd->max_cycles)) {
+ /*
+ * Due to the above mentioned TSC wobbles, filter out
+ * negative motion. Per the above masking, the effective
+ * sign bit is now bit 62.
+ */
+ if (delta & (1ULL << 62))
+ return base >> vd->shift;
+
+ /* Handle multiplication overflow gracefully */
+ return mul_u64_u32_add_u64_shr(delta & S64_MAX, vd->mult, base, vd->shift);
+ }
- return delta * mult;
+ return ((delta * vd->mult) + base) >> vd->shift;
}
-#define vdso_calc_delta vdso_calc_delta
+#define vdso_calc_ns vdso_calc_ns
#endif /* !__ASSEMBLY__ */
diff --git a/arch/x86/include/asm/vdso/vsyscall.h b/arch/x86/include/asm/vdso/vsyscall.h
index be199a9b2676..67fedf1698b5 100644
--- a/arch/x86/include/asm/vdso/vsyscall.h
+++ b/arch/x86/include/asm/vdso/vsyscall.h
@@ -4,13 +4,11 @@
#ifndef __ASSEMBLY__
-#include <linux/hrtimer.h>
#include <linux/timekeeper_internal.h>
#include <vdso/datapage.h>
#include <asm/vgtod.h>
#include <asm/vvar.h>
-DEFINE_VVAR(struct vdso_data, _vdso_data);
/*
* Update the vDSO data page to keep in sync with kernel timekeeping.
*/
@@ -21,6 +19,13 @@ struct vdso_data *__x86_get_k_vdso_data(void)
}
#define __arch_get_k_vdso_data __x86_get_k_vdso_data
+static __always_inline
+struct vdso_rng_data *__x86_get_k_vdso_rng_data(void)
+{
+ return &_vdso_rng_data;
+}
+#define __arch_get_k_vdso_rng_data __x86_get_k_vdso_rng_data
+
/* The asm-generic header needs to be included after the definitions above */
#include <asm-generic/vdso/vsyscall.h>
diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h
index 7aa38b2ad8a9..a0ce291abcae 100644
--- a/arch/x86/include/asm/vgtod.h
+++ b/arch/x86/include/asm/vgtod.h
@@ -14,11 +14,6 @@
#include <uapi/linux/time.h>
-#ifdef BUILD_VDSO32_64
-typedef u64 gtod_long_t;
-#else
-typedef unsigned long gtod_long_t;
-#endif
#endif /* CONFIG_GENERIC_GETTIMEOFDAY */
#endif /* _ASM_X86_VGTOD_H */
diff --git a/arch/x86/include/asm/video.h b/arch/x86/include/asm/video.h
new file mode 100644
index 000000000000..0950c9535fae
--- /dev/null
+++ b/arch/x86/include/asm/video.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_VIDEO_H
+#define _ASM_X86_VIDEO_H
+
+#include <linux/types.h>
+
+#include <asm/page.h>
+
+struct device;
+
+pgprot_t pgprot_framebuffer(pgprot_t prot,
+ unsigned long vm_start, unsigned long vm_end,
+ unsigned long offset);
+#define pgprot_framebuffer pgprot_framebuffer
+
+bool video_is_primary_device(struct device *dev);
+#define video_is_primary_device video_is_primary_device
+
+#include <asm-generic/video.h>
+
+#endif /* _ASM_X86_VIDEO_H */
diff --git a/arch/x86/include/asm/vm86.h b/arch/x86/include/asm/vm86.h
index 9e8ac5073ecb..62ee19909903 100644
--- a/arch/x86/include/asm/vm86.h
+++ b/arch/x86/include/asm/vm86.h
@@ -84,7 +84,7 @@ static inline int handle_vm86_trap(struct kernel_vm86_regs *a, long b, int c)
static inline void save_v86_state(struct kernel_vm86_regs *a, int b) { }
-#define free_vm86(t) do { } while(0)
+#define free_vm86(task) do { (void)(task); } while(0)
#endif /* CONFIG_VM86 */
diff --git a/arch/x86/include/asm/vmware.h b/arch/x86/include/asm/vmware.h
index ac9fc51e2b18..c9cf43d5ef23 100644
--- a/arch/x86/include/asm/vmware.h
+++ b/arch/x86/include/asm/vmware.h
@@ -7,51 +7,321 @@
#include <linux/stringify.h>
/*
- * The hypercall definitions differ in the low word of the %edx argument
- * in the following way: the old port base interface uses the port
- * number to distinguish between high- and low bandwidth versions.
+ * VMware hypercall ABI.
+ *
+ * - Low bandwidth (LB) hypercalls (I/O port based, vmcall and vmmcall)
+ * have up to 6 input and 6 output arguments passed and returned using
+ * registers: %eax (arg0), %ebx (arg1), %ecx (arg2), %edx (arg3),
+ * %esi (arg4), %edi (arg5).
+ * The following input arguments must be initialized by the caller:
+ * arg0 - VMWARE_HYPERVISOR_MAGIC
+ * arg2 - Hypercall command
+ * arg3 bits [15:0] - Port number, LB and direction flags
+ *
+ * - Low bandwidth TDX hypercalls (x86_64 only) are similar to LB
+ * hypercalls. They also have up to 6 input and 6 output on registers
+ * arguments, with different argument to register mapping:
+ * %r12 (arg0), %rbx (arg1), %r13 (arg2), %rdx (arg3),
+ * %rsi (arg4), %rdi (arg5).
+ *
+ * - High bandwidth (HB) hypercalls are I/O port based only. They have
+ * up to 7 input and 7 output arguments passed and returned using
+ * registers: %eax (arg0), %ebx (arg1), %ecx (arg2), %edx (arg3),
+ * %esi (arg4), %edi (arg5), %ebp (arg6).
+ * The following input arguments must be initialized by the caller:
+ * arg0 - VMWARE_HYPERVISOR_MAGIC
+ * arg1 - Hypercall command
+ * arg3 bits [15:0] - Port number, HB and direction flags
+ *
+ * For compatibility purposes, x86_64 systems use only lower 32 bits
+ * for input and output arguments.
+ *
+ * The hypercall definitions differ in the low word of the %edx (arg3)
+ * in the following way: the old I/O port based interface uses the port
+ * number to distinguish between high- and low bandwidth versions, and
+ * uses IN/OUT instructions to define transfer direction.
*
* The new vmcall interface instead uses a set of flags to select
* bandwidth mode and transfer direction. The flags should be loaded
- * into %dx by any user and are automatically replaced by the port
- * number if the VMWARE_HYPERVISOR_PORT method is used.
- *
- * In short, new driver code should strictly use the new definition of
- * %dx content.
+ * into arg3 by any user and are automatically replaced by the port
+ * number if the I/O port method is used.
*/
-/* Old port-based version */
-#define VMWARE_HYPERVISOR_PORT 0x5658
-#define VMWARE_HYPERVISOR_PORT_HB 0x5659
+#define VMWARE_HYPERVISOR_HB BIT(0)
+#define VMWARE_HYPERVISOR_OUT BIT(1)
-/* Current vmcall / vmmcall version */
-#define VMWARE_HYPERVISOR_HB BIT(0)
-#define VMWARE_HYPERVISOR_OUT BIT(1)
+#define VMWARE_HYPERVISOR_PORT 0x5658
+#define VMWARE_HYPERVISOR_PORT_HB (VMWARE_HYPERVISOR_PORT | \
+ VMWARE_HYPERVISOR_HB)
-/* The low bandwidth call. The low word of edx is presumed clear. */
-#define VMWARE_HYPERCALL \
- ALTERNATIVE_2("movw $" __stringify(VMWARE_HYPERVISOR_PORT) ", %%dx; " \
- "inl (%%dx), %%eax", \
- "vmcall", X86_FEATURE_VMCALL, \
- "vmmcall", X86_FEATURE_VMW_VMMCALL)
+#define VMWARE_HYPERVISOR_MAGIC 0x564d5868U
+#define VMWARE_CMD_GETVERSION 10
+#define VMWARE_CMD_GETHZ 45
+#define VMWARE_CMD_GETVCPU_INFO 68
+#define VMWARE_CMD_STEALCLOCK 91
/*
- * The high bandwidth out call. The low word of edx is presumed to have the
- * HB and OUT bits set.
+ * Hypercall command mask:
+ * bits [6:0] command, range [0, 127]
+ * bits [19:16] sub-command, range [0, 15]
*/
-#define VMWARE_HYPERCALL_HB_OUT \
- ALTERNATIVE_2("movw $" __stringify(VMWARE_HYPERVISOR_PORT_HB) ", %%dx; " \
- "rep outsb", \
- "vmcall", X86_FEATURE_VMCALL, \
- "vmmcall", X86_FEATURE_VMW_VMMCALL)
+#define VMWARE_CMD_MASK 0xf007fU
+
+#define CPUID_VMWARE_FEATURES_ECX_VMMCALL BIT(0)
+#define CPUID_VMWARE_FEATURES_ECX_VMCALL BIT(1)
+
+extern unsigned long vmware_hypercall_slow(unsigned long cmd,
+ unsigned long in1, unsigned long in3,
+ unsigned long in4, unsigned long in5,
+ u32 *out1, u32 *out2, u32 *out3,
+ u32 *out4, u32 *out5);
+
+#define VMWARE_TDX_VENDOR_LEAF 0x1af7e4909ULL
+#define VMWARE_TDX_HCALL_FUNC 1
+
+extern unsigned long vmware_tdx_hypercall(unsigned long cmd,
+ unsigned long in1, unsigned long in3,
+ unsigned long in4, unsigned long in5,
+ u32 *out1, u32 *out2, u32 *out3,
+ u32 *out4, u32 *out5);
/*
- * The high bandwidth in call. The low word of edx is presumed to have the
- * HB bit set.
+ * The low bandwidth call. The low word of %edx is presumed to have OUT bit
+ * set. The high word of %edx may contain input data from the caller.
*/
-#define VMWARE_HYPERCALL_HB_IN \
- ALTERNATIVE_2("movw $" __stringify(VMWARE_HYPERVISOR_PORT_HB) ", %%dx; " \
- "rep insb", \
- "vmcall", X86_FEATURE_VMCALL, \
+#define VMWARE_HYPERCALL \
+ ALTERNATIVE_2("movw %[port], %%dx\n\t" \
+ "inl (%%dx), %%eax", \
+ "vmcall", X86_FEATURE_VMCALL, \
"vmmcall", X86_FEATURE_VMW_VMMCALL)
+
+static inline
+unsigned long vmware_hypercall1(unsigned long cmd, unsigned long in1)
+{
+ unsigned long out0;
+
+ if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST))
+ return vmware_tdx_hypercall(cmd, in1, 0, 0, 0,
+ NULL, NULL, NULL, NULL, NULL);
+
+ if (unlikely(!alternatives_patched) && !__is_defined(MODULE))
+ return vmware_hypercall_slow(cmd, in1, 0, 0, 0,
+ NULL, NULL, NULL, NULL, NULL);
+
+ asm_inline volatile (VMWARE_HYPERCALL
+ : "=a" (out0)
+ : [port] "i" (VMWARE_HYPERVISOR_PORT),
+ "a" (VMWARE_HYPERVISOR_MAGIC),
+ "b" (in1),
+ "c" (cmd),
+ "d" (0)
+ : "cc", "memory");
+ return out0;
+}
+
+static inline
+unsigned long vmware_hypercall3(unsigned long cmd, unsigned long in1,
+ u32 *out1, u32 *out2)
+{
+ unsigned long out0;
+
+ if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST))
+ return vmware_tdx_hypercall(cmd, in1, 0, 0, 0,
+ out1, out2, NULL, NULL, NULL);
+
+ if (unlikely(!alternatives_patched) && !__is_defined(MODULE))
+ return vmware_hypercall_slow(cmd, in1, 0, 0, 0,
+ out1, out2, NULL, NULL, NULL);
+
+ asm_inline volatile (VMWARE_HYPERCALL
+ : "=a" (out0), "=b" (*out1), "=c" (*out2)
+ : [port] "i" (VMWARE_HYPERVISOR_PORT),
+ "a" (VMWARE_HYPERVISOR_MAGIC),
+ "b" (in1),
+ "c" (cmd),
+ "d" (0)
+ : "cc", "memory");
+ return out0;
+}
+
+static inline
+unsigned long vmware_hypercall4(unsigned long cmd, unsigned long in1,
+ u32 *out1, u32 *out2, u32 *out3)
+{
+ unsigned long out0;
+
+ if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST))
+ return vmware_tdx_hypercall(cmd, in1, 0, 0, 0,
+ out1, out2, out3, NULL, NULL);
+
+ if (unlikely(!alternatives_patched) && !__is_defined(MODULE))
+ return vmware_hypercall_slow(cmd, in1, 0, 0, 0,
+ out1, out2, out3, NULL, NULL);
+
+ asm_inline volatile (VMWARE_HYPERCALL
+ : "=a" (out0), "=b" (*out1), "=c" (*out2), "=d" (*out3)
+ : [port] "i" (VMWARE_HYPERVISOR_PORT),
+ "a" (VMWARE_HYPERVISOR_MAGIC),
+ "b" (in1),
+ "c" (cmd),
+ "d" (0)
+ : "cc", "memory");
+ return out0;
+}
+
+static inline
+unsigned long vmware_hypercall5(unsigned long cmd, unsigned long in1,
+ unsigned long in3, unsigned long in4,
+ unsigned long in5, u32 *out2)
+{
+ unsigned long out0;
+
+ if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST))
+ return vmware_tdx_hypercall(cmd, in1, in3, in4, in5,
+ NULL, out2, NULL, NULL, NULL);
+
+ if (unlikely(!alternatives_patched) && !__is_defined(MODULE))
+ return vmware_hypercall_slow(cmd, in1, in3, in4, in5,
+ NULL, out2, NULL, NULL, NULL);
+
+ asm_inline volatile (VMWARE_HYPERCALL
+ : "=a" (out0), "=c" (*out2)
+ : [port] "i" (VMWARE_HYPERVISOR_PORT),
+ "a" (VMWARE_HYPERVISOR_MAGIC),
+ "b" (in1),
+ "c" (cmd),
+ "d" (in3),
+ "S" (in4),
+ "D" (in5)
+ : "cc", "memory");
+ return out0;
+}
+
+static inline
+unsigned long vmware_hypercall6(unsigned long cmd, unsigned long in1,
+ unsigned long in3, u32 *out2,
+ u32 *out3, u32 *out4, u32 *out5)
+{
+ unsigned long out0;
+
+ if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST))
+ return vmware_tdx_hypercall(cmd, in1, in3, 0, 0,
+ NULL, out2, out3, out4, out5);
+
+ if (unlikely(!alternatives_patched) && !__is_defined(MODULE))
+ return vmware_hypercall_slow(cmd, in1, in3, 0, 0,
+ NULL, out2, out3, out4, out5);
+
+ asm_inline volatile (VMWARE_HYPERCALL
+ : "=a" (out0), "=c" (*out2), "=d" (*out3), "=S" (*out4),
+ "=D" (*out5)
+ : [port] "i" (VMWARE_HYPERVISOR_PORT),
+ "a" (VMWARE_HYPERVISOR_MAGIC),
+ "b" (in1),
+ "c" (cmd),
+ "d" (in3)
+ : "cc", "memory");
+ return out0;
+}
+
+static inline
+unsigned long vmware_hypercall7(unsigned long cmd, unsigned long in1,
+ unsigned long in3, unsigned long in4,
+ unsigned long in5, u32 *out1,
+ u32 *out2, u32 *out3)
+{
+ unsigned long out0;
+
+ if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST))
+ return vmware_tdx_hypercall(cmd, in1, in3, in4, in5,
+ out1, out2, out3, NULL, NULL);
+
+ if (unlikely(!alternatives_patched) && !__is_defined(MODULE))
+ return vmware_hypercall_slow(cmd, in1, in3, in4, in5,
+ out1, out2, out3, NULL, NULL);
+
+ asm_inline volatile (VMWARE_HYPERCALL
+ : "=a" (out0), "=b" (*out1), "=c" (*out2), "=d" (*out3)
+ : [port] "i" (VMWARE_HYPERVISOR_PORT),
+ "a" (VMWARE_HYPERVISOR_MAGIC),
+ "b" (in1),
+ "c" (cmd),
+ "d" (in3),
+ "S" (in4),
+ "D" (in5)
+ : "cc", "memory");
+ return out0;
+}
+
+#ifdef CONFIG_X86_64
+#define VMW_BP_CONSTRAINT "r"
+#else
+#define VMW_BP_CONSTRAINT "m"
+#endif
+
+/*
+ * High bandwidth calls are not supported on encrypted memory guests.
+ * The caller should check cc_platform_has(CC_ATTR_MEM_ENCRYPT) and use
+ * low bandwidth hypercall if memory encryption is set.
+ * This assumption simplifies HB hypercall implementation to just I/O port
+ * based approach without alternative patching.
+ */
+static inline
+unsigned long vmware_hypercall_hb_out(unsigned long cmd, unsigned long in2,
+ unsigned long in3, unsigned long in4,
+ unsigned long in5, unsigned long in6,
+ u32 *out1)
+{
+ unsigned long out0;
+
+ asm_inline volatile (
+ UNWIND_HINT_SAVE
+ "push %%" _ASM_BP "\n\t"
+ UNWIND_HINT_UNDEFINED
+ "mov %[in6], %%" _ASM_BP "\n\t"
+ "rep outsb\n\t"
+ "pop %%" _ASM_BP "\n\t"
+ UNWIND_HINT_RESTORE
+ : "=a" (out0), "=b" (*out1)
+ : "a" (VMWARE_HYPERVISOR_MAGIC),
+ "b" (cmd),
+ "c" (in2),
+ "d" (in3 | VMWARE_HYPERVISOR_PORT_HB),
+ "S" (in4),
+ "D" (in5),
+ [in6] VMW_BP_CONSTRAINT (in6)
+ : "cc", "memory");
+ return out0;
+}
+
+static inline
+unsigned long vmware_hypercall_hb_in(unsigned long cmd, unsigned long in2,
+ unsigned long in3, unsigned long in4,
+ unsigned long in5, unsigned long in6,
+ u32 *out1)
+{
+ unsigned long out0;
+
+ asm_inline volatile (
+ UNWIND_HINT_SAVE
+ "push %%" _ASM_BP "\n\t"
+ UNWIND_HINT_UNDEFINED
+ "mov %[in6], %%" _ASM_BP "\n\t"
+ "rep insb\n\t"
+ "pop %%" _ASM_BP "\n\t"
+ UNWIND_HINT_RESTORE
+ : "=a" (out0), "=b" (*out1)
+ : "a" (VMWARE_HYPERVISOR_MAGIC),
+ "b" (cmd),
+ "c" (in2),
+ "d" (in3 | VMWARE_HYPERVISOR_PORT_HB),
+ "S" (in4),
+ "D" (in5),
+ [in6] VMW_BP_CONSTRAINT (in6)
+ : "cc", "memory");
+ return out0;
+}
+#undef VMW_BP_CONSTRAINT
+#undef VMWARE_HYPERCALL
+
#endif
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 4dba17363008..f7fd4369b821 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -71,6 +71,7 @@
#define SECONDARY_EXEC_ENCLS_EXITING VMCS_CONTROL_BIT(ENCLS_EXITING)
#define SECONDARY_EXEC_RDSEED_EXITING VMCS_CONTROL_BIT(RDSEED_EXITING)
#define SECONDARY_EXEC_ENABLE_PML VMCS_CONTROL_BIT(PAGE_MOD_LOGGING)
+#define SECONDARY_EXEC_EPT_VIOLATION_VE VMCS_CONTROL_BIT(EPT_VIOLATION_VE)
#define SECONDARY_EXEC_PT_CONCEAL_VMX VMCS_CONTROL_BIT(PT_CONCEAL_VMX)
#define SECONDARY_EXEC_ENABLE_XSAVES VMCS_CONTROL_BIT(XSAVES)
#define SECONDARY_EXEC_MODE_BASED_EPT_EXEC VMCS_CONTROL_BIT(MODE_BASED_EPT_EXEC)
@@ -121,19 +122,17 @@
#define VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR 0x000011ff
-#define VMX_MISC_PREEMPTION_TIMER_RATE_MASK 0x0000001f
-#define VMX_MISC_SAVE_EFER_LMA 0x00000020
-#define VMX_MISC_ACTIVITY_HLT 0x00000040
-#define VMX_MISC_ACTIVITY_WAIT_SIPI 0x00000100
-#define VMX_MISC_ZERO_LEN_INS 0x40000000
-#define VMX_MISC_MSR_LIST_MULTIPLIER 512
-
/* VMFUNC functions */
#define VMFUNC_CONTROL_BIT(x) BIT((VMX_FEATURE_##x & 0x1f) - 28)
#define VMX_VMFUNC_EPTP_SWITCHING VMFUNC_CONTROL_BIT(EPTP_SWITCHING)
#define VMFUNC_EPTP_ENTRIES 512
+#define VMX_BASIC_32BIT_PHYS_ADDR_ONLY BIT_ULL(48)
+#define VMX_BASIC_DUAL_MONITOR_TREATMENT BIT_ULL(49)
+#define VMX_BASIC_INOUT BIT_ULL(54)
+#define VMX_BASIC_TRUE_CTLS BIT_ULL(55)
+
static inline u32 vmx_basic_vmcs_revision_id(u64 vmx_basic)
{
return vmx_basic & GENMASK_ULL(30, 0);
@@ -144,9 +143,30 @@ static inline u32 vmx_basic_vmcs_size(u64 vmx_basic)
return (vmx_basic & GENMASK_ULL(44, 32)) >> 32;
}
+static inline u32 vmx_basic_vmcs_mem_type(u64 vmx_basic)
+{
+ return (vmx_basic & GENMASK_ULL(53, 50)) >> 50;
+}
+
+static inline u64 vmx_basic_encode_vmcs_info(u32 revision, u16 size, u8 memtype)
+{
+ return revision | ((u64)size << 32) | ((u64)memtype << 50);
+}
+
+#define VMX_MISC_SAVE_EFER_LMA BIT_ULL(5)
+#define VMX_MISC_ACTIVITY_HLT BIT_ULL(6)
+#define VMX_MISC_ACTIVITY_SHUTDOWN BIT_ULL(7)
+#define VMX_MISC_ACTIVITY_WAIT_SIPI BIT_ULL(8)
+#define VMX_MISC_INTEL_PT BIT_ULL(14)
+#define VMX_MISC_RDMSR_IN_SMM BIT_ULL(15)
+#define VMX_MISC_VMXOFF_BLOCK_SMI BIT_ULL(28)
+#define VMX_MISC_VMWRITE_SHADOW_RO_FIELDS BIT_ULL(29)
+#define VMX_MISC_ZERO_LEN_INS BIT_ULL(30)
+#define VMX_MISC_MSR_LIST_MULTIPLIER 512
+
static inline int vmx_misc_preemption_timer_rate(u64 vmx_misc)
{
- return vmx_misc & VMX_MISC_PREEMPTION_TIMER_RATE_MASK;
+ return vmx_misc & GENMASK_ULL(4, 0);
}
static inline int vmx_misc_cr3_count(u64 vmx_misc)
@@ -226,6 +246,8 @@ enum vmcs_field {
VMREAD_BITMAP_HIGH = 0x00002027,
VMWRITE_BITMAP = 0x00002028,
VMWRITE_BITMAP_HIGH = 0x00002029,
+ VE_INFORMATION_ADDRESS = 0x0000202A,
+ VE_INFORMATION_ADDRESS_HIGH = 0x0000202B,
XSS_EXIT_BITMAP = 0x0000202C,
XSS_EXIT_BITMAP_HIGH = 0x0000202D,
ENCLS_EXITING_BITMAP = 0x0000202E,
@@ -505,15 +527,17 @@ enum vmcs_field {
#define VMX_EPTP_PWL_4 0x18ull
#define VMX_EPTP_PWL_5 0x20ull
#define VMX_EPTP_AD_ENABLE_BIT (1ull << 6)
+/* The EPTP memtype is encoded in bits 2:0, i.e. doesn't need to be shifted. */
#define VMX_EPTP_MT_MASK 0x7ull
-#define VMX_EPTP_MT_WB 0x6ull
-#define VMX_EPTP_MT_UC 0x0ull
+#define VMX_EPTP_MT_WB X86_MEMTYPE_WB
+#define VMX_EPTP_MT_UC X86_MEMTYPE_UC
#define VMX_EPT_READABLE_MASK 0x1ull
#define VMX_EPT_WRITABLE_MASK 0x2ull
#define VMX_EPT_EXECUTABLE_MASK 0x4ull
#define VMX_EPT_IPAT_BIT (1ull << 6)
#define VMX_EPT_ACCESS_BIT (1ull << 8)
#define VMX_EPT_DIRTY_BIT (1ull << 9)
+#define VMX_EPT_SUPPRESS_VE_BIT (1ull << 63)
#define VMX_EPT_RWX_MASK (VMX_EPT_READABLE_MASK | \
VMX_EPT_WRITABLE_MASK | \
VMX_EPT_EXECUTABLE_MASK)
@@ -630,4 +654,13 @@ enum vmx_l1d_flush_state {
extern enum vmx_l1d_flush_state l1tf_vmx_mitigation;
+struct vmx_ve_information {
+ u32 exit_reason;
+ u32 delivery;
+ u64 exit_qualification;
+ u64 guest_linear_address;
+ u64 guest_physical_address;
+ u16 eptp_index;
+};
+
#endif
diff --git a/arch/x86/include/asm/vmxfeatures.h b/arch/x86/include/asm/vmxfeatures.h
index 266daf5b5b84..09b1d7e607c1 100644
--- a/arch/x86/include/asm/vmxfeatures.h
+++ b/arch/x86/include/asm/vmxfeatures.h
@@ -9,85 +9,85 @@
/*
* Note: If the comment begins with a quoted string, that string is used
- * in /proc/cpuinfo instead of the macro name. If the string is "",
- * this feature bit is not displayed in /proc/cpuinfo at all.
+ * in /proc/cpuinfo instead of the macro name. Otherwise, this feature bit
+ * is not displayed in /proc/cpuinfo at all.
*/
/* Pin-Based VM-Execution Controls, EPT/VPID, APIC and VM-Functions, word 0 */
-#define VMX_FEATURE_INTR_EXITING ( 0*32+ 0) /* "" VM-Exit on vectored interrupts */
-#define VMX_FEATURE_NMI_EXITING ( 0*32+ 3) /* "" VM-Exit on NMIs */
+#define VMX_FEATURE_INTR_EXITING ( 0*32+ 0) /* VM-Exit on vectored interrupts */
+#define VMX_FEATURE_NMI_EXITING ( 0*32+ 3) /* VM-Exit on NMIs */
#define VMX_FEATURE_VIRTUAL_NMIS ( 0*32+ 5) /* "vnmi" NMI virtualization */
-#define VMX_FEATURE_PREEMPTION_TIMER ( 0*32+ 6) /* VMX Preemption Timer */
-#define VMX_FEATURE_POSTED_INTR ( 0*32+ 7) /* Posted Interrupts */
+#define VMX_FEATURE_PREEMPTION_TIMER ( 0*32+ 6) /* "preemption_timer" VMX Preemption Timer */
+#define VMX_FEATURE_POSTED_INTR ( 0*32+ 7) /* "posted_intr" Posted Interrupts */
/* EPT/VPID features, scattered to bits 16-23 */
-#define VMX_FEATURE_INVVPID ( 0*32+ 16) /* INVVPID is supported */
+#define VMX_FEATURE_INVVPID ( 0*32+ 16) /* "invvpid" INVVPID is supported */
#define VMX_FEATURE_EPT_EXECUTE_ONLY ( 0*32+ 17) /* "ept_x_only" EPT entries can be execute only */
-#define VMX_FEATURE_EPT_AD ( 0*32+ 18) /* EPT Accessed/Dirty bits */
-#define VMX_FEATURE_EPT_1GB ( 0*32+ 19) /* 1GB EPT pages */
-#define VMX_FEATURE_EPT_5LEVEL ( 0*32+ 20) /* 5-level EPT paging */
+#define VMX_FEATURE_EPT_AD ( 0*32+ 18) /* "ept_ad" EPT Accessed/Dirty bits */
+#define VMX_FEATURE_EPT_1GB ( 0*32+ 19) /* "ept_1gb" 1GB EPT pages */
+#define VMX_FEATURE_EPT_5LEVEL ( 0*32+ 20) /* "ept_5level" 5-level EPT paging */
/* Aggregated APIC features 24-27 */
-#define VMX_FEATURE_FLEXPRIORITY ( 0*32+ 24) /* TPR shadow + virt APIC */
-#define VMX_FEATURE_APICV ( 0*32+ 25) /* TPR shadow + APIC reg virt + virt intr delivery + posted interrupts */
+#define VMX_FEATURE_FLEXPRIORITY ( 0*32+ 24) /* "flexpriority" TPR shadow + virt APIC */
+#define VMX_FEATURE_APICV ( 0*32+ 25) /* "apicv" TPR shadow + APIC reg virt + virt intr delivery + posted interrupts */
/* VM-Functions, shifted to bits 28-31 */
-#define VMX_FEATURE_EPTP_SWITCHING ( 0*32+ 28) /* EPTP switching (in guest) */
+#define VMX_FEATURE_EPTP_SWITCHING ( 0*32+ 28) /* "eptp_switching" EPTP switching (in guest) */
/* Primary Processor-Based VM-Execution Controls, word 1 */
-#define VMX_FEATURE_INTR_WINDOW_EXITING ( 1*32+ 2) /* "" VM-Exit if INTRs are unblocked in guest */
+#define VMX_FEATURE_INTR_WINDOW_EXITING ( 1*32+ 2) /* VM-Exit if INTRs are unblocked in guest */
#define VMX_FEATURE_USE_TSC_OFFSETTING ( 1*32+ 3) /* "tsc_offset" Offset hardware TSC when read in guest */
-#define VMX_FEATURE_HLT_EXITING ( 1*32+ 7) /* "" VM-Exit on HLT */
-#define VMX_FEATURE_INVLPG_EXITING ( 1*32+ 9) /* "" VM-Exit on INVLPG */
-#define VMX_FEATURE_MWAIT_EXITING ( 1*32+ 10) /* "" VM-Exit on MWAIT */
-#define VMX_FEATURE_RDPMC_EXITING ( 1*32+ 11) /* "" VM-Exit on RDPMC */
-#define VMX_FEATURE_RDTSC_EXITING ( 1*32+ 12) /* "" VM-Exit on RDTSC */
-#define VMX_FEATURE_CR3_LOAD_EXITING ( 1*32+ 15) /* "" VM-Exit on writes to CR3 */
-#define VMX_FEATURE_CR3_STORE_EXITING ( 1*32+ 16) /* "" VM-Exit on reads from CR3 */
-#define VMX_FEATURE_TERTIARY_CONTROLS ( 1*32+ 17) /* "" Enable Tertiary VM-Execution Controls */
-#define VMX_FEATURE_CR8_LOAD_EXITING ( 1*32+ 19) /* "" VM-Exit on writes to CR8 */
-#define VMX_FEATURE_CR8_STORE_EXITING ( 1*32+ 20) /* "" VM-Exit on reads from CR8 */
+#define VMX_FEATURE_HLT_EXITING ( 1*32+ 7) /* VM-Exit on HLT */
+#define VMX_FEATURE_INVLPG_EXITING ( 1*32+ 9) /* VM-Exit on INVLPG */
+#define VMX_FEATURE_MWAIT_EXITING ( 1*32+ 10) /* VM-Exit on MWAIT */
+#define VMX_FEATURE_RDPMC_EXITING ( 1*32+ 11) /* VM-Exit on RDPMC */
+#define VMX_FEATURE_RDTSC_EXITING ( 1*32+ 12) /* VM-Exit on RDTSC */
+#define VMX_FEATURE_CR3_LOAD_EXITING ( 1*32+ 15) /* VM-Exit on writes to CR3 */
+#define VMX_FEATURE_CR3_STORE_EXITING ( 1*32+ 16) /* VM-Exit on reads from CR3 */
+#define VMX_FEATURE_TERTIARY_CONTROLS ( 1*32+ 17) /* Enable Tertiary VM-Execution Controls */
+#define VMX_FEATURE_CR8_LOAD_EXITING ( 1*32+ 19) /* VM-Exit on writes to CR8 */
+#define VMX_FEATURE_CR8_STORE_EXITING ( 1*32+ 20) /* VM-Exit on reads from CR8 */
#define VMX_FEATURE_VIRTUAL_TPR ( 1*32+ 21) /* "vtpr" TPR virtualization, a.k.a. TPR shadow */
-#define VMX_FEATURE_NMI_WINDOW_EXITING ( 1*32+ 22) /* "" VM-Exit if NMIs are unblocked in guest */
-#define VMX_FEATURE_MOV_DR_EXITING ( 1*32+ 23) /* "" VM-Exit on accesses to debug registers */
-#define VMX_FEATURE_UNCOND_IO_EXITING ( 1*32+ 24) /* "" VM-Exit on *all* IN{S} and OUT{S}*/
-#define VMX_FEATURE_USE_IO_BITMAPS ( 1*32+ 25) /* "" VM-Exit based on I/O port */
+#define VMX_FEATURE_NMI_WINDOW_EXITING ( 1*32+ 22) /* VM-Exit if NMIs are unblocked in guest */
+#define VMX_FEATURE_MOV_DR_EXITING ( 1*32+ 23) /* VM-Exit on accesses to debug registers */
+#define VMX_FEATURE_UNCOND_IO_EXITING ( 1*32+ 24) /* VM-Exit on *all* IN{S} and OUT{S}*/
+#define VMX_FEATURE_USE_IO_BITMAPS ( 1*32+ 25) /* VM-Exit based on I/O port */
#define VMX_FEATURE_MONITOR_TRAP_FLAG ( 1*32+ 27) /* "mtf" VMX single-step VM-Exits */
-#define VMX_FEATURE_USE_MSR_BITMAPS ( 1*32+ 28) /* "" VM-Exit based on MSR index */
-#define VMX_FEATURE_MONITOR_EXITING ( 1*32+ 29) /* "" VM-Exit on MONITOR (MWAIT's accomplice) */
-#define VMX_FEATURE_PAUSE_EXITING ( 1*32+ 30) /* "" VM-Exit on PAUSE (unconditionally) */
-#define VMX_FEATURE_SEC_CONTROLS ( 1*32+ 31) /* "" Enable Secondary VM-Execution Controls */
+#define VMX_FEATURE_USE_MSR_BITMAPS ( 1*32+ 28) /* VM-Exit based on MSR index */
+#define VMX_FEATURE_MONITOR_EXITING ( 1*32+ 29) /* VM-Exit on MONITOR (MWAIT's accomplice) */
+#define VMX_FEATURE_PAUSE_EXITING ( 1*32+ 30) /* VM-Exit on PAUSE (unconditionally) */
+#define VMX_FEATURE_SEC_CONTROLS ( 1*32+ 31) /* Enable Secondary VM-Execution Controls */
/* Secondary Processor-Based VM-Execution Controls, word 2 */
#define VMX_FEATURE_VIRT_APIC_ACCESSES ( 2*32+ 0) /* "vapic" Virtualize memory mapped APIC accesses */
-#define VMX_FEATURE_EPT ( 2*32+ 1) /* Extended Page Tables, a.k.a. Two-Dimensional Paging */
-#define VMX_FEATURE_DESC_EXITING ( 2*32+ 2) /* "" VM-Exit on {S,L}*DT instructions */
-#define VMX_FEATURE_RDTSCP ( 2*32+ 3) /* "" Enable RDTSCP in guest */
-#define VMX_FEATURE_VIRTUAL_X2APIC ( 2*32+ 4) /* "" Virtualize X2APIC for the guest */
-#define VMX_FEATURE_VPID ( 2*32+ 5) /* Virtual Processor ID (TLB ASID modifier) */
-#define VMX_FEATURE_WBINVD_EXITING ( 2*32+ 6) /* "" VM-Exit on WBINVD */
-#define VMX_FEATURE_UNRESTRICTED_GUEST ( 2*32+ 7) /* Allow Big Real Mode and other "invalid" states */
+#define VMX_FEATURE_EPT ( 2*32+ 1) /* "ept" Extended Page Tables, a.k.a. Two-Dimensional Paging */
+#define VMX_FEATURE_DESC_EXITING ( 2*32+ 2) /* VM-Exit on {S,L}*DT instructions */
+#define VMX_FEATURE_RDTSCP ( 2*32+ 3) /* Enable RDTSCP in guest */
+#define VMX_FEATURE_VIRTUAL_X2APIC ( 2*32+ 4) /* Virtualize X2APIC for the guest */
+#define VMX_FEATURE_VPID ( 2*32+ 5) /* "vpid" Virtual Processor ID (TLB ASID modifier) */
+#define VMX_FEATURE_WBINVD_EXITING ( 2*32+ 6) /* VM-Exit on WBINVD */
+#define VMX_FEATURE_UNRESTRICTED_GUEST ( 2*32+ 7) /* "unrestricted_guest" Allow Big Real Mode and other "invalid" states */
#define VMX_FEATURE_APIC_REGISTER_VIRT ( 2*32+ 8) /* "vapic_reg" Hardware emulation of reads to the virtual-APIC */
#define VMX_FEATURE_VIRT_INTR_DELIVERY ( 2*32+ 9) /* "vid" Evaluation and delivery of pending virtual interrupts */
#define VMX_FEATURE_PAUSE_LOOP_EXITING ( 2*32+ 10) /* "ple" Conditionally VM-Exit on PAUSE at CPL0 */
-#define VMX_FEATURE_RDRAND_EXITING ( 2*32+ 11) /* "" VM-Exit on RDRAND*/
-#define VMX_FEATURE_INVPCID ( 2*32+ 12) /* "" Enable INVPCID in guest */
-#define VMX_FEATURE_VMFUNC ( 2*32+ 13) /* "" Enable VM-Functions (leaf dependent) */
-#define VMX_FEATURE_SHADOW_VMCS ( 2*32+ 14) /* VMREAD/VMWRITE in guest can access shadow VMCS */
-#define VMX_FEATURE_ENCLS_EXITING ( 2*32+ 15) /* "" VM-Exit on ENCLS (leaf dependent) */
-#define VMX_FEATURE_RDSEED_EXITING ( 2*32+ 16) /* "" VM-Exit on RDSEED */
+#define VMX_FEATURE_RDRAND_EXITING ( 2*32+ 11) /* VM-Exit on RDRAND*/
+#define VMX_FEATURE_INVPCID ( 2*32+ 12) /* Enable INVPCID in guest */
+#define VMX_FEATURE_VMFUNC ( 2*32+ 13) /* Enable VM-Functions (leaf dependent) */
+#define VMX_FEATURE_SHADOW_VMCS ( 2*32+ 14) /* "shadow_vmcs" VMREAD/VMWRITE in guest can access shadow VMCS */
+#define VMX_FEATURE_ENCLS_EXITING ( 2*32+ 15) /* VM-Exit on ENCLS (leaf dependent) */
+#define VMX_FEATURE_RDSEED_EXITING ( 2*32+ 16) /* VM-Exit on RDSEED */
#define VMX_FEATURE_PAGE_MOD_LOGGING ( 2*32+ 17) /* "pml" Log dirty pages into buffer */
-#define VMX_FEATURE_EPT_VIOLATION_VE ( 2*32+ 18) /* "" Conditionally reflect EPT violations as #VE exceptions */
-#define VMX_FEATURE_PT_CONCEAL_VMX ( 2*32+ 19) /* "" Suppress VMX indicators in Processor Trace */
-#define VMX_FEATURE_XSAVES ( 2*32+ 20) /* "" Enable XSAVES and XRSTORS in guest */
+#define VMX_FEATURE_EPT_VIOLATION_VE ( 2*32+ 18) /* "ept_violation_ve" Conditionally reflect EPT violations as #VE exceptions */
+#define VMX_FEATURE_PT_CONCEAL_VMX ( 2*32+ 19) /* Suppress VMX indicators in Processor Trace */
+#define VMX_FEATURE_XSAVES ( 2*32+ 20) /* Enable XSAVES and XRSTORS in guest */
#define VMX_FEATURE_MODE_BASED_EPT_EXEC ( 2*32+ 22) /* "ept_mode_based_exec" Enable separate EPT EXEC bits for supervisor vs. user */
-#define VMX_FEATURE_PT_USE_GPA ( 2*32+ 24) /* "" Processor Trace logs GPAs */
-#define VMX_FEATURE_TSC_SCALING ( 2*32+ 25) /* Scale hardware TSC when read in guest */
-#define VMX_FEATURE_USR_WAIT_PAUSE ( 2*32+ 26) /* Enable TPAUSE, UMONITOR, UMWAIT in guest */
-#define VMX_FEATURE_ENCLV_EXITING ( 2*32+ 28) /* "" VM-Exit on ENCLV (leaf dependent) */
-#define VMX_FEATURE_BUS_LOCK_DETECTION ( 2*32+ 30) /* "" VM-Exit when bus lock caused */
-#define VMX_FEATURE_NOTIFY_VM_EXITING ( 2*32+ 31) /* VM-Exit when no event windows after notify window */
+#define VMX_FEATURE_PT_USE_GPA ( 2*32+ 24) /* Processor Trace logs GPAs */
+#define VMX_FEATURE_TSC_SCALING ( 2*32+ 25) /* "tsc_scaling" Scale hardware TSC when read in guest */
+#define VMX_FEATURE_USR_WAIT_PAUSE ( 2*32+ 26) /* "usr_wait_pause" Enable TPAUSE, UMONITOR, UMWAIT in guest */
+#define VMX_FEATURE_ENCLV_EXITING ( 2*32+ 28) /* VM-Exit on ENCLV (leaf dependent) */
+#define VMX_FEATURE_BUS_LOCK_DETECTION ( 2*32+ 30) /* VM-Exit when bus lock caused */
+#define VMX_FEATURE_NOTIFY_VM_EXITING ( 2*32+ 31) /* "notify_vm_exiting" VM-Exit when no event windows after notify window */
/* Tertiary Processor-Based VM-Execution Controls, word 3 */
-#define VMX_FEATURE_IPI_VIRT ( 3*32+ 4) /* Enable IPI virtualization */
+#define VMX_FEATURE_IPI_VIRT ( 3*32+ 4) /* "ipi_virt" Enable IPI virtualization */
#endif /* _ASM_X86_VMXFEATURES_H */
diff --git a/arch/x86/include/asm/vvar.h b/arch/x86/include/asm/vvar.h
index 183e98e49ab9..9d9af37f7cab 100644
--- a/arch/x86/include/asm/vvar.h
+++ b/arch/x86/include/asm/vvar.h
@@ -26,6 +26,8 @@
*/
#define DECLARE_VVAR(offset, type, name) \
EMIT_VVAR(name, offset)
+#define DECLARE_VVAR_SINGLE(offset, type, name) \
+ EMIT_VVAR(name, offset)
#else
@@ -37,6 +39,10 @@ extern char __vvar_page;
extern type timens_ ## name[CS_BASES] \
__attribute__((visibility("hidden"))); \
+#define DECLARE_VVAR_SINGLE(offset, type, name) \
+ extern type vvar_ ## name \
+ __attribute__((visibility("hidden"))); \
+
#define VVAR(name) (vvar_ ## name)
#define TIMENS(name) (timens_ ## name)
@@ -44,12 +50,22 @@ extern char __vvar_page;
type name[CS_BASES] \
__attribute__((section(".vvar_" #name), aligned(16))) __visible
+#define DEFINE_VVAR_SINGLE(type, name) \
+ type name \
+ __attribute__((section(".vvar_" #name), aligned(16))) __visible
+
#endif
/* DECLARE_VVAR(offset, type, name) */
DECLARE_VVAR(128, struct vdso_data, _vdso_data)
+#if !defined(_SINGLE_DATA)
+#define _SINGLE_DATA
+DECLARE_VVAR_SINGLE(640, struct vdso_rng_data, _vdso_rng_data)
+#endif
+
#undef DECLARE_VVAR
+#undef DECLARE_VVAR_SINGLE
#endif
diff --git a/arch/x86/include/asm/word-at-a-time.h b/arch/x86/include/asm/word-at-a-time.h
index e8d7d4941c4c..422a47746657 100644
--- a/arch/x86/include/asm/word-at-a-time.h
+++ b/arch/x86/include/asm/word-at-a-time.h
@@ -5,45 +5,12 @@
#include <linux/bitops.h>
#include <linux/wordpart.h>
-/*
- * This is largely generic for little-endian machines, but the
- * optimal byte mask counting is probably going to be something
- * that is architecture-specific. If you have a reliably fast
- * bit count instruction, that might be better than the multiply
- * and shift, for example.
- */
struct word_at_a_time {
const unsigned long one_bits, high_bits;
};
#define WORD_AT_A_TIME_CONSTANTS { REPEAT_BYTE(0x01), REPEAT_BYTE(0x80) }
-#ifdef CONFIG_64BIT
-
-/*
- * Jan Achrenius on G+: microoptimized version of
- * the simpler "(mask & ONEBYTES) * ONEBYTES >> 56"
- * that works for the bytemasks without having to
- * mask them first.
- */
-static inline long count_masked_bytes(unsigned long mask)
-{
- return mask*0x0001020304050608ul >> 56;
-}
-
-#else /* 32-bit case */
-
-/* Carl Chatfield / Jan Achrenius G+ version for 32-bit */
-static inline long count_masked_bytes(long mask)
-{
- /* (000000 0000ff 00ffff ffffff) -> ( 1 1 2 3 ) */
- long a = (0x0ff0001+mask) >> 23;
- /* Fix the 1 for 00 case */
- return a & mask;
-}
-
-#endif
-
/* Return nonzero if it has a zero */
static inline unsigned long has_zero(unsigned long a, unsigned long *bits, const struct word_at_a_time *c)
{
@@ -57,6 +24,22 @@ static inline unsigned long prep_zero_mask(unsigned long a, unsigned long bits,
return bits;
}
+#ifdef CONFIG_64BIT
+
+/* Keep the initial has_zero() value for both bitmask and size calc */
+#define create_zero_mask(bits) (bits)
+
+static inline unsigned long zero_bytemask(unsigned long bits)
+{
+ bits = (bits - 1) & ~bits;
+ return bits >> 7;
+}
+
+#define find_zero(bits) (__ffs(bits) >> 3)
+
+#else
+
+/* Create the final mask for both bytemask and size */
static inline unsigned long create_zero_mask(unsigned long bits)
{
bits = (bits - 1) & ~bits;
@@ -66,11 +49,17 @@ static inline unsigned long create_zero_mask(unsigned long bits)
/* The mask we created is directly usable as a bytemask */
#define zero_bytemask(mask) (mask)
+/* Carl Chatfield / Jan Achrenius G+ version for 32-bit */
static inline unsigned long find_zero(unsigned long mask)
{
- return count_masked_bytes(mask);
+ /* (000000 0000ff 00ffff ffffff) -> ( 1 1 2 3 ) */
+ long a = (0x0ff0001+mask) >> 23;
+ /* Fix the 1 for 00 case */
+ return a & mask;
}
+#endif
+
/*
* Load an unaligned word from kernel space.
*
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 6149eabe200f..213cf5379a5a 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -149,12 +149,22 @@ struct x86_init_acpi {
* @enc_status_change_finish Notify HV after the encryption status of a range is changed
* @enc_tlb_flush_required Returns true if a TLB flush is needed before changing page encryption status
* @enc_cache_flush_required Returns true if a cache flush is needed before changing page encryption status
+ * @enc_kexec_begin Begin the two-step process of converting shared memory back
+ * to private. It stops the new conversions from being started
+ * and waits in-flight conversions to finish, if possible.
+ * @enc_kexec_finish Finish the two-step process of converting shared memory to
+ * private. All memory is private after the call when
+ * the function returns.
+ * It is called on only one CPU while the others are shut down
+ * and with interrupts disabled.
*/
struct x86_guest {
- bool (*enc_status_change_prepare)(unsigned long vaddr, int npages, bool enc);
- bool (*enc_status_change_finish)(unsigned long vaddr, int npages, bool enc);
+ int (*enc_status_change_prepare)(unsigned long vaddr, int npages, bool enc);
+ int (*enc_status_change_finish)(unsigned long vaddr, int npages, bool enc);
bool (*enc_tlb_flush_required)(bool enc);
bool (*enc_cache_flush_required)(void);
+ void (*enc_kexec_begin)(void);
+ void (*enc_kexec_finish)(void);
};
/**
diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h
index 64fbd2dbc5b7..a9088250770f 100644
--- a/arch/x86/include/asm/xen/hypervisor.h
+++ b/arch/x86/include/asm/xen/hypervisor.h
@@ -62,11 +62,6 @@ void xen_arch_unregister_cpu(int num);
#ifdef CONFIG_PVH
void __init xen_pvh_init(struct boot_params *boot_params);
void __init mem_map_via_hcall(struct boot_params *boot_params_p);
-#ifdef CONFIG_XEN_PVH
-void __init xen_reserve_extra_memory(struct boot_params *bootp);
-#else
-static inline void xen_reserve_extra_memory(struct boot_params *bootp) { }
-#endif
#endif
/* Lazy mode for batching updates / context switch */
diff --git a/arch/x86/include/uapi/asm/elf.h b/arch/x86/include/uapi/asm/elf.h
new file mode 100644
index 000000000000..468e135fa285
--- /dev/null
+++ b/arch/x86/include/uapi/asm/elf.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_ASM_X86_ELF_H
+#define _UAPI_ASM_X86_ELF_H
+
+#include <linux/types.h>
+
+struct x86_xfeat_component {
+ __u32 type;
+ __u32 size;
+ __u32 offset;
+ __u32 flags;
+} __packed;
+
+_Static_assert(sizeof(struct x86_xfeat_component) % 4 == 0, "x86_xfeat_component is not aligned");
+
+#endif /* _UAPI_ASM_X86_ELF_H */
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index ef11aa4cab42..a8debbf2f702 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -106,6 +106,7 @@ struct kvm_ioapic_state {
#define KVM_RUN_X86_SMM (1 << 0)
#define KVM_RUN_X86_BUS_LOCK (1 << 1)
+#define KVM_RUN_X86_GUEST_MODE (1 << 2)
/* for KVM_GET_REGS and KVM_SET_REGS */
struct kvm_regs {
@@ -438,6 +439,7 @@ struct kvm_sync_regs {
#define KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT (1 << 4)
#define KVM_X86_QUIRK_FIX_HYPERCALL_INSN (1 << 5)
#define KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS (1 << 6)
+#define KVM_X86_QUIRK_SLOT_ZAP_ALL (1 << 7)
#define KVM_STATE_NESTED_FORMAT_VMX 0
#define KVM_STATE_NESTED_FORMAT_SVM 1
@@ -457,8 +459,13 @@ struct kvm_sync_regs {
#define KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE 0x00000001
-/* attributes for system fd (group 0) */
-#define KVM_X86_XCOMP_GUEST_SUPP 0
+/* vendor-independent attributes for system fd (group 0) */
+#define KVM_X86_GRP_SYSTEM 0
+# define KVM_X86_XCOMP_GUEST_SUPP 0
+
+/* vendor-specific groups and attributes for system fd */
+#define KVM_X86_GRP_SEV 1
+# define KVM_X86_SEV_VMSA_FEATURES 0
struct kvm_vmx_nested_state_data {
__u8 vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE];
@@ -689,6 +696,14 @@ enum sev_cmd_id {
/* Guest Migration Extension */
KVM_SEV_SEND_CANCEL,
+ /* Second time is the charm; improved versions of the above ioctls. */
+ KVM_SEV_INIT2,
+
+ /* SNP-specific commands */
+ KVM_SEV_SNP_LAUNCH_START = 100,
+ KVM_SEV_SNP_LAUNCH_UPDATE,
+ KVM_SEV_SNP_LAUNCH_FINISH,
+
KVM_SEV_NR_MAX,
};
@@ -700,6 +715,14 @@ struct kvm_sev_cmd {
__u32 sev_fd;
};
+struct kvm_sev_init {
+ __u64 vmsa_features;
+ __u32 flags;
+ __u16 ghcb_version;
+ __u16 pad1;
+ __u32 pad2[8];
+};
+
struct kvm_sev_launch_start {
__u32 handle;
__u32 policy;
@@ -808,6 +831,48 @@ struct kvm_sev_receive_update_data {
__u32 pad2;
};
+struct kvm_sev_snp_launch_start {
+ __u64 policy;
+ __u8 gosvw[16];
+ __u16 flags;
+ __u8 pad0[6];
+ __u64 pad1[4];
+};
+
+/* Kept in sync with firmware values for simplicity. */
+#define KVM_SEV_SNP_PAGE_TYPE_NORMAL 0x1
+#define KVM_SEV_SNP_PAGE_TYPE_ZERO 0x3
+#define KVM_SEV_SNP_PAGE_TYPE_UNMEASURED 0x4
+#define KVM_SEV_SNP_PAGE_TYPE_SECRETS 0x5
+#define KVM_SEV_SNP_PAGE_TYPE_CPUID 0x6
+
+struct kvm_sev_snp_launch_update {
+ __u64 gfn_start;
+ __u64 uaddr;
+ __u64 len;
+ __u8 type;
+ __u8 pad0;
+ __u16 flags;
+ __u32 pad1;
+ __u64 pad2[4];
+};
+
+#define KVM_SEV_SNP_ID_BLOCK_SIZE 96
+#define KVM_SEV_SNP_ID_AUTH_SIZE 4096
+#define KVM_SEV_SNP_FINISH_DATA_SIZE 32
+
+struct kvm_sev_snp_launch_finish {
+ __u64 id_block_uaddr;
+ __u64 id_auth_uaddr;
+ __u8 id_block_en;
+ __u8 auth_key_en;
+ __u8 vcek_disabled;
+ __u8 host_data[KVM_SEV_SNP_FINISH_DATA_SIZE];
+ __u8 pad0[3];
+ __u16 flags;
+ __u64 pad1[4];
+};
+
#define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0)
#define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1)
@@ -856,5 +921,8 @@ struct kvm_hyperv_eventfd {
#define KVM_X86_DEFAULT_VM 0
#define KVM_X86_SW_PROTECTED_VM 1
+#define KVM_X86_SEV_VM 2
+#define KVM_X86_SEV_ES_VM 3
+#define KVM_X86_SNP_VM 4
#endif /* _ASM_X86_KVM_H */
diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h
index 80e1df482337..1814b413fd57 100644
--- a/arch/x86/include/uapi/asm/svm.h
+++ b/arch/x86/include/uapi/asm/svm.h
@@ -115,6 +115,7 @@
#define SVM_VMGEXIT_AP_CREATE_ON_INIT 0
#define SVM_VMGEXIT_AP_CREATE 1
#define SVM_VMGEXIT_AP_DESTROY 2
+#define SVM_VMGEXIT_SNP_RUN_VMPL 0x80000018
#define SVM_VMGEXIT_HV_FEATURES 0x8000fffd
#define SVM_VMGEXIT_TERM_REQUEST 0x8000fffe
#define SVM_VMGEXIT_TERM_REASON(reason_set, reason_code) \
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 74077694da7d..f7918980667a 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -17,7 +17,6 @@ CFLAGS_REMOVE_ftrace.o = -pg
CFLAGS_REMOVE_early_printk.o = -pg
CFLAGS_REMOVE_head64.o = -pg
CFLAGS_REMOVE_head32.o = -pg
-CFLAGS_REMOVE_sev.o = -pg
CFLAGS_REMOVE_rethook.o = -pg
endif
@@ -26,21 +25,26 @@ KASAN_SANITIZE_dumpstack.o := n
KASAN_SANITIZE_dumpstack_$(BITS).o := n
KASAN_SANITIZE_stacktrace.o := n
KASAN_SANITIZE_paravirt.o := n
-KASAN_SANITIZE_sev.o := n
# With some compiler versions the generated code results in boot hangs, caused
# by several compilation units. To be safe, disable all instrumentation.
KCSAN_SANITIZE := n
KMSAN_SANITIZE_head$(BITS).o := n
KMSAN_SANITIZE_nmi.o := n
-KMSAN_SANITIZE_sev.o := n
# If instrumentation of the following files is enabled, boot hangs during
# first second.
KCOV_INSTRUMENT_head$(BITS).o := n
-KCOV_INSTRUMENT_sev.o := n
+# These are called from save_stack_trace() on debug paths,
+# and produce large amounts of uninteresting coverage.
+KCOV_INSTRUMENT_stacktrace.o := n
+KCOV_INSTRUMENT_dumpstack.o := n
+KCOV_INSTRUMENT_dumpstack_$(BITS).o := n
+KCOV_INSTRUMENT_unwind_orc.o := n
+KCOV_INSTRUMENT_unwind_frame.o := n
+KCOV_INSTRUMENT_unwind_guess.o := n
-CFLAGS_irq.o := -I $(srctree)/$(src)/../include/asm/trace
+CFLAGS_irq.o := -I $(src)/../include/asm/trace
obj-y += head_$(BITS).o
obj-y += head$(BITS).o
@@ -62,7 +66,7 @@ obj-$(CONFIG_X86_64) += sys_x86_64.o
obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o
obj-$(CONFIG_SYSFS) += ksysfs.o
obj-y += bootflag.o e820.o
-obj-y += pci-dma.o quirks.o topology.o kdebugfs.o
+obj-y += pci-dma.o quirks.o kdebugfs.o
obj-y += alternative.o i8253.o hw_breakpoint.o
obj-y += tsc.o tsc_msr.o io_delay.o rtc.o
obj-y += resource.o
@@ -142,8 +146,6 @@ obj-$(CONFIG_UNWINDER_ORC) += unwind_orc.o
obj-$(CONFIG_UNWINDER_FRAME_POINTER) += unwind_frame.o
obj-$(CONFIG_UNWINDER_GUESS) += unwind_guess.o
-obj-$(CONFIG_AMD_MEM_ENCRYPT) += sev.o
-
obj-$(CONFIG_CFI_CLANG) += cfi.o
obj-$(CONFIG_CALL_THUNKS) += callthunks.o
diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile
index fc17b3f136fe..842a5f449404 100644
--- a/arch/x86/kernel/acpi/Makefile
+++ b/arch/x86/kernel/acpi/Makefile
@@ -4,6 +4,7 @@ obj-$(CONFIG_ACPI) += boot.o
obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_$(BITS).o
obj-$(CONFIG_ACPI_APEI) += apei.o
obj-$(CONFIG_ACPI_CPPC_LIB) += cppc.o
+obj-$(CONFIG_ACPI_MADT_WAKEUP) += madt_wakeup.o madt_playdead.o
ifneq ($(CONFIG_ACPI_PROCESSOR),)
obj-y += cstate.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 4bf82dbd2a6b..4efecac49863 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -67,13 +67,6 @@ static bool has_lapic_cpus __initdata;
static bool acpi_support_online_capable;
#endif
-#ifdef CONFIG_X86_64
-/* Physical address of the Multiprocessor Wakeup Structure mailbox */
-static u64 acpi_mp_wake_mailbox_paddr;
-/* Virtual address of the Multiprocessor Wakeup Structure mailbox */
-static struct acpi_madt_multiproc_wakeup_mailbox *acpi_mp_wake_mailbox;
-#endif
-
#ifdef CONFIG_X86_IO_APIC
/*
* Locks related to IOAPIC hotplug
@@ -341,60 +334,6 @@ acpi_parse_lapic_nmi(union acpi_subtable_headers * header, const unsigned long e
return 0;
}
-
-#ifdef CONFIG_X86_64
-static int acpi_wakeup_cpu(u32 apicid, unsigned long start_ip)
-{
- /*
- * Remap mailbox memory only for the first call to acpi_wakeup_cpu().
- *
- * Wakeup of secondary CPUs is fully serialized in the core code.
- * No need to protect acpi_mp_wake_mailbox from concurrent accesses.
- */
- if (!acpi_mp_wake_mailbox) {
- acpi_mp_wake_mailbox = memremap(acpi_mp_wake_mailbox_paddr,
- sizeof(*acpi_mp_wake_mailbox),
- MEMREMAP_WB);
- }
-
- /*
- * Mailbox memory is shared between the firmware and OS. Firmware will
- * listen on mailbox command address, and once it receives the wakeup
- * command, the CPU associated with the given apicid will be booted.
- *
- * The value of 'apic_id' and 'wakeup_vector' must be visible to the
- * firmware before the wakeup command is visible. smp_store_release()
- * ensures ordering and visibility.
- */
- acpi_mp_wake_mailbox->apic_id = apicid;
- acpi_mp_wake_mailbox->wakeup_vector = start_ip;
- smp_store_release(&acpi_mp_wake_mailbox->command,
- ACPI_MP_WAKE_COMMAND_WAKEUP);
-
- /*
- * Wait for the CPU to wake up.
- *
- * The CPU being woken up is essentially in a spin loop waiting to be
- * woken up. It should not take long for it wake up and acknowledge by
- * zeroing out ->command.
- *
- * ACPI specification doesn't provide any guidance on how long kernel
- * has to wait for a wake up acknowledgement. It also doesn't provide
- * a way to cancel a wake up request if it takes too long.
- *
- * In TDX environment, the VMM has control over how long it takes to
- * wake up secondary. It can postpone scheduling secondary vCPU
- * indefinitely. Giving up on wake up request and reporting error opens
- * possible attack vector for VMM: it can wake up a secondary CPU when
- * kernel doesn't expect it. Wait until positive result of the wake up
- * request.
- */
- while (READ_ONCE(acpi_mp_wake_mailbox->command))
- cpu_relax();
-
- return 0;
-}
-#endif /* CONFIG_X86_64 */
#endif /* CONFIG_X86_LOCAL_APIC */
#ifdef CONFIG_X86_IO_APIC
@@ -1124,29 +1063,6 @@ static int __init acpi_parse_madt_lapic_entries(void)
}
return 0;
}
-
-#ifdef CONFIG_X86_64
-static int __init acpi_parse_mp_wake(union acpi_subtable_headers *header,
- const unsigned long end)
-{
- struct acpi_madt_multiproc_wakeup *mp_wake;
-
- if (!IS_ENABLED(CONFIG_SMP))
- return -ENODEV;
-
- mp_wake = (struct acpi_madt_multiproc_wakeup *)header;
- if (BAD_MADT_ENTRY(mp_wake, end))
- return -EINVAL;
-
- acpi_table_print_madt_entry(&header->common);
-
- acpi_mp_wake_mailbox_paddr = mp_wake->base_address;
-
- apic_update_callback(wakeup_secondary_cpu_64, acpi_wakeup_cpu);
-
- return 0;
-}
-#endif /* CONFIG_X86_64 */
#endif /* CONFIG_X86_LOCAL_APIC */
#ifdef CONFIG_X86_IO_APIC
@@ -1343,7 +1259,7 @@ static void __init acpi_process_madt(void)
smp_found_config = 1;
}
-#ifdef CONFIG_X86_64
+#ifdef CONFIG_ACPI_MADT_WAKEUP
/*
* Parse MADT MP Wake entry.
*/
@@ -1862,3 +1778,14 @@ u64 x86_default_get_root_pointer(void)
{
return boot_params.acpi_rsdp_addr;
}
+
+#ifdef CONFIG_XEN_PV
+void __iomem *x86_acpi_os_ioremap(acpi_physical_address phys, acpi_size size)
+{
+ return ioremap_cache(phys, size);
+}
+
+void __iomem * (*acpi_os_ioremap)(acpi_physical_address phys, acpi_size size) =
+ x86_acpi_os_ioremap;
+EXPORT_SYMBOL_GPL(acpi_os_ioremap);
+#endif
diff --git a/arch/x86/kernel/acpi/cppc.c b/arch/x86/kernel/acpi/cppc.c
index ff8f25faca3d..956984054bf3 100644
--- a/arch/x86/kernel/acpi/cppc.c
+++ b/arch/x86/kernel/acpi/cppc.c
@@ -9,6 +9,17 @@
#include <asm/processor.h>
#include <asm/topology.h>
+#define CPPC_HIGHEST_PERF_PERFORMANCE 196
+#define CPPC_HIGHEST_PERF_PREFCORE 166
+
+enum amd_pref_core {
+ AMD_PREF_CORE_UNKNOWN = 0,
+ AMD_PREF_CORE_SUPPORTED,
+ AMD_PREF_CORE_UNSUPPORTED,
+};
+static enum amd_pref_core amd_pref_core_detected;
+static u64 boost_numerator;
+
/* Refer to drivers/acpi/cppc_acpi.c for the description of functions */
bool cpc_supported_by_cpu(void)
@@ -69,31 +80,30 @@ int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val)
static void amd_set_max_freq_ratio(void)
{
struct cppc_perf_caps perf_caps;
- u64 highest_perf, nominal_perf;
+ u64 numerator, nominal_perf;
u64 perf_ratio;
int rc;
rc = cppc_get_perf_caps(0, &perf_caps);
if (rc) {
- pr_debug("Could not retrieve perf counters (%d)\n", rc);
+ pr_warn("Could not retrieve perf counters (%d)\n", rc);
return;
}
- highest_perf = amd_get_highest_perf();
+ rc = amd_get_boost_ratio_numerator(0, &numerator);
+ if (rc) {
+ pr_warn("Could not retrieve highest performance (%d)\n", rc);
+ return;
+ }
nominal_perf = perf_caps.nominal_perf;
- if (!highest_perf || !nominal_perf) {
- pr_debug("Could not retrieve highest or nominal performance\n");
+ if (!nominal_perf) {
+ pr_warn("Could not retrieve nominal performance\n");
return;
}
- perf_ratio = div_u64(highest_perf * SCHED_CAPACITY_SCALE, nominal_perf);
/* midpoint between max_boost and max_P */
- perf_ratio = (perf_ratio + SCHED_CAPACITY_SCALE) >> 1;
- if (!perf_ratio) {
- pr_debug("Non-zero highest/nominal perf values led to a 0 ratio\n");
- return;
- }
+ perf_ratio = (div_u64(numerator * SCHED_CAPACITY_SCALE, nominal_perf) + SCHED_CAPACITY_SCALE) >> 1;
freq_invariance_set_perf_ratio(perf_ratio, false);
}
@@ -116,3 +126,143 @@ void init_freq_invariance_cppc(void)
init_done = true;
mutex_unlock(&freq_invariance_lock);
}
+
+/*
+ * Get the highest performance register value.
+ * @cpu: CPU from which to get highest performance.
+ * @highest_perf: Return address for highest performance value.
+ *
+ * Return: 0 for success, negative error code otherwise.
+ */
+int amd_get_highest_perf(unsigned int cpu, u32 *highest_perf)
+{
+ u64 val;
+ int ret;
+
+ if (cpu_feature_enabled(X86_FEATURE_CPPC)) {
+ ret = rdmsrl_safe_on_cpu(cpu, MSR_AMD_CPPC_CAP1, &val);
+ if (ret)
+ goto out;
+
+ val = AMD_CPPC_HIGHEST_PERF(val);
+ } else {
+ ret = cppc_get_highest_perf(cpu, &val);
+ if (ret)
+ goto out;
+ }
+
+ WRITE_ONCE(*highest_perf, (u32)val);
+out:
+ return ret;
+}
+EXPORT_SYMBOL_GPL(amd_get_highest_perf);
+
+/**
+ * amd_detect_prefcore: Detect if CPUs in the system support preferred cores
+ * @detected: Output variable for the result of the detection.
+ *
+ * Determine whether CPUs in the system support preferred cores. On systems
+ * that support preferred cores, different highest perf values will be found
+ * on different cores. On other systems, the highest perf value will be the
+ * same on all cores.
+ *
+ * The result of the detection will be stored in the 'detected' parameter.
+ *
+ * Return: 0 for success, negative error code otherwise
+ */
+int amd_detect_prefcore(bool *detected)
+{
+ int cpu, count = 0;
+ u64 highest_perf[2] = {0};
+
+ if (WARN_ON(!detected))
+ return -EINVAL;
+
+ switch (amd_pref_core_detected) {
+ case AMD_PREF_CORE_SUPPORTED:
+ *detected = true;
+ return 0;
+ case AMD_PREF_CORE_UNSUPPORTED:
+ *detected = false;
+ return 0;
+ default:
+ break;
+ }
+
+ for_each_present_cpu(cpu) {
+ u32 tmp;
+ int ret;
+
+ ret = amd_get_highest_perf(cpu, &tmp);
+ if (ret)
+ return ret;
+
+ if (!count || (count == 1 && tmp != highest_perf[0]))
+ highest_perf[count++] = tmp;
+
+ if (count == 2)
+ break;
+ }
+
+ *detected = (count == 2);
+ boost_numerator = highest_perf[0];
+
+ amd_pref_core_detected = *detected ? AMD_PREF_CORE_SUPPORTED :
+ AMD_PREF_CORE_UNSUPPORTED;
+
+ pr_debug("AMD CPPC preferred core is %ssupported (highest perf: 0x%llx)\n",
+ *detected ? "" : "un", highest_perf[0]);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(amd_detect_prefcore);
+
+/**
+ * amd_get_boost_ratio_numerator: Get the numerator to use for boost ratio calculation
+ * @cpu: CPU to get numerator for.
+ * @numerator: Output variable for numerator.
+ *
+ * Determine the numerator to use for calculating the boost ratio on
+ * a CPU. On systems that support preferred cores, this will be a hardcoded
+ * value. On other systems this will the highest performance register value.
+ *
+ * If booting the system with amd-pstate enabled but preferred cores disabled then
+ * the correct boost numerator will be returned to match hardware capabilities
+ * even if the preferred cores scheduling hints are not enabled.
+ *
+ * Return: 0 for success, negative error code otherwise.
+ */
+int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator)
+{
+ bool prefcore;
+ int ret;
+
+ ret = amd_detect_prefcore(&prefcore);
+ if (ret)
+ return ret;
+
+ /* without preferred cores, return the highest perf register value */
+ if (!prefcore) {
+ *numerator = boost_numerator;
+ return 0;
+ }
+
+ /*
+ * For AMD CPUs with Family ID 19H and Model ID range 0x70 to 0x7f,
+ * the highest performance level is set to 196.
+ * https://bugzilla.kernel.org/show_bug.cgi?id=218759
+ */
+ if (cpu_feature_enabled(X86_FEATURE_ZEN4)) {
+ switch (boot_cpu_data.x86_model) {
+ case 0x70 ... 0x7f:
+ *numerator = CPPC_HIGHEST_PERF_PERFORMANCE;
+ return 0;
+ default:
+ break;
+ }
+ }
+ *numerator = CPPC_HIGHEST_PERF_PREFCORE;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(amd_get_boost_ratio_numerator);
diff --git a/arch/x86/kernel/acpi/madt_playdead.S b/arch/x86/kernel/acpi/madt_playdead.S
new file mode 100644
index 000000000000..4e498d28cdc8
--- /dev/null
+++ b/arch/x86/kernel/acpi/madt_playdead.S
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/linkage.h>
+#include <asm/nospec-branch.h>
+#include <asm/page_types.h>
+#include <asm/processor-flags.h>
+
+ .text
+ .align PAGE_SIZE
+
+/*
+ * asm_acpi_mp_play_dead() - Hand over control of the CPU to the BIOS
+ *
+ * rdi: Address of the ACPI MADT MPWK ResetVector
+ * rsi: PGD of the identity mapping
+ */
+SYM_FUNC_START(asm_acpi_mp_play_dead)
+ /* Turn off global entries. Following CR3 write will flush them. */
+ movq %cr4, %rdx
+ andq $~(X86_CR4_PGE), %rdx
+ movq %rdx, %cr4
+
+ /* Switch to identity mapping */
+ movq %rsi, %cr3
+
+ /* Jump to reset vector */
+ ANNOTATE_RETPOLINE_SAFE
+ jmp *%rdi
+SYM_FUNC_END(asm_acpi_mp_play_dead)
diff --git a/arch/x86/kernel/acpi/madt_wakeup.c b/arch/x86/kernel/acpi/madt_wakeup.c
new file mode 100644
index 000000000000..d5ef6215583b
--- /dev/null
+++ b/arch/x86/kernel/acpi/madt_wakeup.c
@@ -0,0 +1,292 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+#include <linux/acpi.h>
+#include <linux/cpu.h>
+#include <linux/delay.h>
+#include <linux/io.h>
+#include <linux/kexec.h>
+#include <linux/memblock.h>
+#include <linux/pgtable.h>
+#include <linux/sched/hotplug.h>
+#include <asm/apic.h>
+#include <asm/barrier.h>
+#include <asm/init.h>
+#include <asm/intel_pt.h>
+#include <asm/nmi.h>
+#include <asm/processor.h>
+#include <asm/reboot.h>
+
+/* Physical address of the Multiprocessor Wakeup Structure mailbox */
+static u64 acpi_mp_wake_mailbox_paddr __ro_after_init;
+
+/* Virtual address of the Multiprocessor Wakeup Structure mailbox */
+static struct acpi_madt_multiproc_wakeup_mailbox *acpi_mp_wake_mailbox;
+
+static u64 acpi_mp_pgd __ro_after_init;
+static u64 acpi_mp_reset_vector_paddr __ro_after_init;
+
+static void acpi_mp_stop_this_cpu(void)
+{
+ asm_acpi_mp_play_dead(acpi_mp_reset_vector_paddr, acpi_mp_pgd);
+}
+
+static void acpi_mp_play_dead(void)
+{
+ play_dead_common();
+ asm_acpi_mp_play_dead(acpi_mp_reset_vector_paddr, acpi_mp_pgd);
+}
+
+static void acpi_mp_cpu_die(unsigned int cpu)
+{
+ u32 apicid = per_cpu(x86_cpu_to_apicid, cpu);
+ unsigned long timeout;
+
+ /*
+ * Use TEST mailbox command to prove that BIOS got control over
+ * the CPU before declaring it dead.
+ *
+ * BIOS has to clear 'command' field of the mailbox.
+ */
+ acpi_mp_wake_mailbox->apic_id = apicid;
+ smp_store_release(&acpi_mp_wake_mailbox->command,
+ ACPI_MP_WAKE_COMMAND_TEST);
+
+ /* Don't wait longer than a second. */
+ timeout = USEC_PER_SEC;
+ while (READ_ONCE(acpi_mp_wake_mailbox->command) && --timeout)
+ udelay(1);
+
+ if (!timeout)
+ pr_err("Failed to hand over CPU %d to BIOS\n", cpu);
+}
+
+/* The argument is required to match type of x86_mapping_info::alloc_pgt_page */
+static void __init *alloc_pgt_page(void *dummy)
+{
+ return memblock_alloc(PAGE_SIZE, PAGE_SIZE);
+}
+
+static void __init free_pgt_page(void *pgt, void *dummy)
+{
+ return memblock_free(pgt, PAGE_SIZE);
+}
+
+/*
+ * Make sure asm_acpi_mp_play_dead() is present in the identity mapping at
+ * the same place as in the kernel page tables. asm_acpi_mp_play_dead() switches
+ * to the identity mapping and the function has be present at the same spot in
+ * the virtual address space before and after switching page tables.
+ */
+static int __init init_transition_pgtable(pgd_t *pgd)
+{
+ pgprot_t prot = PAGE_KERNEL_EXEC_NOENC;
+ unsigned long vaddr, paddr;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ vaddr = (unsigned long)asm_acpi_mp_play_dead;
+ pgd += pgd_index(vaddr);
+ if (!pgd_present(*pgd)) {
+ p4d = (p4d_t *)alloc_pgt_page(NULL);
+ if (!p4d)
+ return -ENOMEM;
+ set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE));
+ }
+ p4d = p4d_offset(pgd, vaddr);
+ if (!p4d_present(*p4d)) {
+ pud = (pud_t *)alloc_pgt_page(NULL);
+ if (!pud)
+ return -ENOMEM;
+ set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE));
+ }
+ pud = pud_offset(p4d, vaddr);
+ if (!pud_present(*pud)) {
+ pmd = (pmd_t *)alloc_pgt_page(NULL);
+ if (!pmd)
+ return -ENOMEM;
+ set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
+ }
+ pmd = pmd_offset(pud, vaddr);
+ if (!pmd_present(*pmd)) {
+ pte = (pte_t *)alloc_pgt_page(NULL);
+ if (!pte)
+ return -ENOMEM;
+ set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
+ }
+ pte = pte_offset_kernel(pmd, vaddr);
+
+ paddr = __pa(vaddr);
+ set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, prot));
+
+ return 0;
+}
+
+static int __init acpi_mp_setup_reset(u64 reset_vector)
+{
+ struct x86_mapping_info info = {
+ .alloc_pgt_page = alloc_pgt_page,
+ .free_pgt_page = free_pgt_page,
+ .page_flag = __PAGE_KERNEL_LARGE_EXEC,
+ .kernpg_flag = _KERNPG_TABLE_NOENC,
+ };
+ pgd_t *pgd;
+
+ pgd = alloc_pgt_page(NULL);
+ if (!pgd)
+ return -ENOMEM;
+
+ for (int i = 0; i < nr_pfn_mapped; i++) {
+ unsigned long mstart, mend;
+
+ mstart = pfn_mapped[i].start << PAGE_SHIFT;
+ mend = pfn_mapped[i].end << PAGE_SHIFT;
+ if (kernel_ident_mapping_init(&info, pgd, mstart, mend)) {
+ kernel_ident_mapping_free(&info, pgd);
+ return -ENOMEM;
+ }
+ }
+
+ if (kernel_ident_mapping_init(&info, pgd,
+ PAGE_ALIGN_DOWN(reset_vector),
+ PAGE_ALIGN(reset_vector + 1))) {
+ kernel_ident_mapping_free(&info, pgd);
+ return -ENOMEM;
+ }
+
+ if (init_transition_pgtable(pgd)) {
+ kernel_ident_mapping_free(&info, pgd);
+ return -ENOMEM;
+ }
+
+ smp_ops.play_dead = acpi_mp_play_dead;
+ smp_ops.stop_this_cpu = acpi_mp_stop_this_cpu;
+ smp_ops.cpu_die = acpi_mp_cpu_die;
+
+ acpi_mp_reset_vector_paddr = reset_vector;
+ acpi_mp_pgd = __pa(pgd);
+
+ return 0;
+}
+
+static int acpi_wakeup_cpu(u32 apicid, unsigned long start_ip)
+{
+ if (!acpi_mp_wake_mailbox_paddr) {
+ pr_warn_once("No MADT mailbox: cannot bringup secondary CPUs. Booting with kexec?\n");
+ return -EOPNOTSUPP;
+ }
+
+ /*
+ * Remap mailbox memory only for the first call to acpi_wakeup_cpu().
+ *
+ * Wakeup of secondary CPUs is fully serialized in the core code.
+ * No need to protect acpi_mp_wake_mailbox from concurrent accesses.
+ */
+ if (!acpi_mp_wake_mailbox) {
+ acpi_mp_wake_mailbox = memremap(acpi_mp_wake_mailbox_paddr,
+ sizeof(*acpi_mp_wake_mailbox),
+ MEMREMAP_WB);
+ }
+
+ /*
+ * Mailbox memory is shared between the firmware and OS. Firmware will
+ * listen on mailbox command address, and once it receives the wakeup
+ * command, the CPU associated with the given apicid will be booted.
+ *
+ * The value of 'apic_id' and 'wakeup_vector' must be visible to the
+ * firmware before the wakeup command is visible. smp_store_release()
+ * ensures ordering and visibility.
+ */
+ acpi_mp_wake_mailbox->apic_id = apicid;
+ acpi_mp_wake_mailbox->wakeup_vector = start_ip;
+ smp_store_release(&acpi_mp_wake_mailbox->command,
+ ACPI_MP_WAKE_COMMAND_WAKEUP);
+
+ /*
+ * Wait for the CPU to wake up.
+ *
+ * The CPU being woken up is essentially in a spin loop waiting to be
+ * woken up. It should not take long for it wake up and acknowledge by
+ * zeroing out ->command.
+ *
+ * ACPI specification doesn't provide any guidance on how long kernel
+ * has to wait for a wake up acknowledgment. It also doesn't provide
+ * a way to cancel a wake up request if it takes too long.
+ *
+ * In TDX environment, the VMM has control over how long it takes to
+ * wake up secondary. It can postpone scheduling secondary vCPU
+ * indefinitely. Giving up on wake up request and reporting error opens
+ * possible attack vector for VMM: it can wake up a secondary CPU when
+ * kernel doesn't expect it. Wait until positive result of the wake up
+ * request.
+ */
+ while (READ_ONCE(acpi_mp_wake_mailbox->command))
+ cpu_relax();
+
+ return 0;
+}
+
+static void acpi_mp_disable_offlining(struct acpi_madt_multiproc_wakeup *mp_wake)
+{
+ cpu_hotplug_disable_offlining();
+
+ /*
+ * ACPI MADT doesn't allow to offline a CPU after it was onlined. This
+ * limits kexec: the second kernel won't be able to use more than one CPU.
+ *
+ * To prevent a kexec kernel from onlining secondary CPUs invalidate the
+ * mailbox address in the ACPI MADT wakeup structure which prevents a
+ * kexec kernel to use it.
+ *
+ * This is safe as the booting kernel has the mailbox address cached
+ * already and acpi_wakeup_cpu() uses the cached value to bring up the
+ * secondary CPUs.
+ *
+ * Note: This is a Linux specific convention and not covered by the
+ * ACPI specification.
+ */
+ mp_wake->mailbox_address = 0;
+}
+
+int __init acpi_parse_mp_wake(union acpi_subtable_headers *header,
+ const unsigned long end)
+{
+ struct acpi_madt_multiproc_wakeup *mp_wake;
+
+ mp_wake = (struct acpi_madt_multiproc_wakeup *)header;
+
+ /*
+ * Cannot use the standard BAD_MADT_ENTRY() to sanity check the @mp_wake
+ * entry. 'sizeof (struct acpi_madt_multiproc_wakeup)' can be larger
+ * than the actual size of the MP wakeup entry in ACPI table because the
+ * 'reset_vector' is only available in the V1 MP wakeup structure.
+ */
+ if (!mp_wake)
+ return -EINVAL;
+ if (end - (unsigned long)mp_wake < ACPI_MADT_MP_WAKEUP_SIZE_V0)
+ return -EINVAL;
+ if (mp_wake->header.length < ACPI_MADT_MP_WAKEUP_SIZE_V0)
+ return -EINVAL;
+
+ acpi_table_print_madt_entry(&header->common);
+
+ acpi_mp_wake_mailbox_paddr = mp_wake->mailbox_address;
+
+ if (mp_wake->version >= ACPI_MADT_MP_WAKEUP_VERSION_V1 &&
+ mp_wake->header.length >= ACPI_MADT_MP_WAKEUP_SIZE_V1) {
+ if (acpi_mp_setup_reset(mp_wake->reset_vector)) {
+ pr_warn("Failed to setup MADT reset vector\n");
+ acpi_mp_disable_offlining(mp_wake);
+ }
+ } else {
+ /*
+ * CPU offlining requires version 1 of the ACPI MADT wakeup
+ * structure.
+ */
+ acpi_mp_disable_offlining(mp_wake);
+ }
+
+ apic_update_callback(wakeup_secondary_cpu_64, acpi_wakeup_cpu);
+
+ return 0;
+}
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 45a280f2161c..d17518ca19b8 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -125,6 +125,20 @@ const unsigned char * const x86_nops[ASM_NOP_MAX+1] =
};
/*
+ * Nomenclature for variable names to simplify and clarify this code and ease
+ * any potential staring at it:
+ *
+ * @instr: source address of the original instructions in the kernel text as
+ * generated by the compiler.
+ *
+ * @buf: temporary buffer on which the patching operates. This buffer is
+ * eventually text-poked into the kernel image.
+ *
+ * @replacement/@repl: pointer to the opcodes which are replacing @instr, located
+ * in the .altinstr_replacement section.
+ */
+
+/*
* Fill the buffer with a single effective instruction of size @len.
*
* In order not to issue an ORC stack depth tracking CFI entry (Call Frame Info)
@@ -133,28 +147,28 @@ const unsigned char * const x86_nops[ASM_NOP_MAX+1] =
* each single-byte NOPs). If @len to fill out is > ASM_NOP_MAX, pad with INT3 and
* *jump* over instead of executing long and daft NOPs.
*/
-static void add_nop(u8 *instr, unsigned int len)
+static void add_nop(u8 *buf, unsigned int len)
{
- u8 *target = instr + len;
+ u8 *target = buf + len;
if (!len)
return;
if (len <= ASM_NOP_MAX) {
- memcpy(instr, x86_nops[len], len);
+ memcpy(buf, x86_nops[len], len);
return;
}
if (len < 128) {
- __text_gen_insn(instr, JMP8_INSN_OPCODE, instr, target, JMP8_INSN_SIZE);
- instr += JMP8_INSN_SIZE;
+ __text_gen_insn(buf, JMP8_INSN_OPCODE, buf, target, JMP8_INSN_SIZE);
+ buf += JMP8_INSN_SIZE;
} else {
- __text_gen_insn(instr, JMP32_INSN_OPCODE, instr, target, JMP32_INSN_SIZE);
- instr += JMP32_INSN_SIZE;
+ __text_gen_insn(buf, JMP32_INSN_OPCODE, buf, target, JMP32_INSN_SIZE);
+ buf += JMP32_INSN_SIZE;
}
- for (;instr < target; instr++)
- *instr = INT3_INSN_OPCODE;
+ for (;buf < target; buf++)
+ *buf = INT3_INSN_OPCODE;
}
extern s32 __retpoline_sites[], __retpoline_sites_end[];
@@ -187,12 +201,12 @@ static bool insn_is_nop(struct insn *insn)
* Find the offset of the first non-NOP instruction starting at @offset
* but no further than @len.
*/
-static int skip_nops(u8 *instr, int offset, int len)
+static int skip_nops(u8 *buf, int offset, int len)
{
struct insn insn;
for (; offset < len; offset += insn.length) {
- if (insn_decode_kernel(&insn, &instr[offset]))
+ if (insn_decode_kernel(&insn, &buf[offset]))
break;
if (!insn_is_nop(&insn))
@@ -203,66 +217,32 @@ static int skip_nops(u8 *instr, int offset, int len)
}
/*
- * Optimize a sequence of NOPs, possibly preceded by an unconditional jump
- * to the end of the NOP sequence into a single NOP.
- */
-static bool
-__optimize_nops(u8 *instr, size_t len, struct insn *insn, int *next, int *prev, int *target)
-{
- int i = *next - insn->length;
-
- switch (insn->opcode.bytes[0]) {
- case JMP8_INSN_OPCODE:
- case JMP32_INSN_OPCODE:
- *prev = i;
- *target = *next + insn->immediate.value;
- return false;
- }
-
- if (insn_is_nop(insn)) {
- int nop = i;
-
- *next = skip_nops(instr, *next, len);
- if (*target && *next == *target)
- nop = *prev;
-
- add_nop(instr + nop, *next - nop);
- DUMP_BYTES(ALT, instr, len, "%px: [%d:%d) optimized NOPs: ", instr, nop, *next);
- return true;
- }
-
- *target = 0;
- return false;
-}
-
-/*
* "noinline" to cause control flow change and thus invalidate I$ and
* cause refetch after modification.
*/
-static void __init_or_module noinline optimize_nops(u8 *instr, size_t len)
+static void noinline optimize_nops(const u8 * const instr, u8 *buf, size_t len)
{
- int prev, target = 0;
-
for (int next, i = 0; i < len; i = next) {
struct insn insn;
- if (insn_decode_kernel(&insn, &instr[i]))
+ if (insn_decode_kernel(&insn, &buf[i]))
return;
next = i + insn.length;
- __optimize_nops(instr, len, &insn, &next, &prev, &target);
- }
-}
+ if (insn_is_nop(&insn)) {
+ int nop = i;
-static void __init_or_module noinline optimize_nops_inplace(u8 *instr, size_t len)
-{
- unsigned long flags;
+ /* Has the NOP already been optimized? */
+ if (i + insn.length == len)
+ return;
- local_irq_save(flags);
- optimize_nops(instr, len);
- sync_core();
- local_irq_restore(flags);
+ next = skip_nops(buf, next, len);
+
+ add_nop(buf + nop, next - nop);
+ DUMP_BYTES(ALT, buf, len, "%px: [%d:%d) optimized NOPs: ", instr, nop, next);
+ }
+ }
}
/*
@@ -335,11 +315,9 @@ bool need_reloc(unsigned long offset, u8 *src, size_t src_len)
return (target < src || target > src + src_len);
}
-void apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len)
+static void __apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len)
{
- int prev, target = 0;
-
- for (int next, i = 0; i < len; i = next) {
+ for (int next, i = 0; i < instrlen; i = next) {
struct insn insn;
if (WARN_ON_ONCE(insn_decode_kernel(&insn, &buf[i])))
@@ -347,9 +325,6 @@ void apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len)
next = i + insn.length;
- if (__optimize_nops(buf, len, &insn, &next, &prev, &target))
- continue;
-
switch (insn.opcode.bytes[0]) {
case 0x0f:
if (insn.opcode.bytes[1] < 0x80 ||
@@ -361,10 +336,10 @@ void apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len)
case JMP8_INSN_OPCODE:
case JMP32_INSN_OPCODE:
case CALL_INSN_OPCODE:
- if (need_reloc(next + insn.immediate.value, src, src_len)) {
+ if (need_reloc(next + insn.immediate.value, repl, repl_len)) {
apply_reloc(insn.immediate.nbytes,
buf + i + insn_offset_immediate(&insn),
- src - dest);
+ repl - instr);
}
/*
@@ -372,7 +347,7 @@ void apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len)
*/
if (insn.opcode.bytes[0] == JMP32_INSN_OPCODE) {
s32 imm = insn.immediate.value;
- imm += src - dest;
+ imm += repl - instr;
imm += JMP32_INSN_SIZE - JMP8_INSN_SIZE;
if ((imm >> 31) == (imm >> 7)) {
buf[i+0] = JMP8_INSN_OPCODE;
@@ -385,15 +360,21 @@ void apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len)
}
if (insn_rip_relative(&insn)) {
- if (need_reloc(next + insn.displacement.value, src, src_len)) {
+ if (need_reloc(next + insn.displacement.value, repl, repl_len)) {
apply_reloc(insn.displacement.nbytes,
buf + i + insn_offset_displacement(&insn),
- src - dest);
+ repl - instr);
}
}
}
}
+void apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len)
+{
+ __apply_relocation(buf, instr, instrlen, repl, repl_len);
+ optimize_nops(instr, buf, instrlen);
+}
+
/* Low-level backend functions usable from alternative code replacements. */
DEFINE_ASM_FUNC(nop_func, "", .entry.text);
EXPORT_SYMBOL_GPL(nop_func);
@@ -451,6 +432,11 @@ static int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a)
return 5;
}
+static inline u8 * instr_va(struct alt_instr *i)
+{
+ return (u8 *)&i->instr_offset + i->instr_offset;
+}
+
/*
* Replace instructions with better alternatives for this CPU type. This runs
* before SMP is initialized to avoid SMP problems with self modifying code.
@@ -464,9 +450,9 @@ static int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a)
void __init_or_module noinline apply_alternatives(struct alt_instr *start,
struct alt_instr *end)
{
- struct alt_instr *a;
- u8 *instr, *replacement;
u8 insn_buff[MAX_PATCH_LEN];
+ u8 *instr, *replacement;
+ struct alt_instr *a, *b;
DPRINTK(ALT, "alt table %px, -> %px", start, end);
@@ -492,7 +478,18 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
for (a = start; a < end; a++) {
int insn_buff_sz = 0;
- instr = (u8 *)&a->instr_offset + a->instr_offset;
+ /*
+ * In case of nested ALTERNATIVE()s the outer alternative might
+ * add more padding. To ensure consistent patching find the max
+ * padding for all alt_instr entries for this site (nested
+ * alternatives result in consecutive entries).
+ */
+ for (b = a+1; b < end && instr_va(b) == instr_va(a); b++) {
+ u8 len = max(a->instrlen, b->instrlen);
+ a->instrlen = b->instrlen = len;
+ }
+
+ instr = instr_va(a);
replacement = (u8 *)&a->repl_offset + a->repl_offset;
BUG_ON(a->instrlen > sizeof(insn_buff));
BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32);
@@ -504,7 +501,9 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
* patch if feature is *NOT* present.
*/
if (!boot_cpu_has(a->cpuid) == !(a->flags & ALT_FLAG_NOT)) {
- optimize_nops_inplace(instr, a->instrlen);
+ memcpy(insn_buff, instr, a->instrlen);
+ optimize_nops(instr, insn_buff, a->instrlen);
+ text_poke_early(instr, insn_buff, a->instrlen);
continue;
}
@@ -526,7 +525,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
for (; insn_buff_sz < a->instrlen; insn_buff_sz++)
insn_buff[insn_buff_sz] = 0x90;
- apply_relocation(insn_buff, a->instrlen, instr, replacement, a->replacementlen);
+ apply_relocation(insn_buff, instr, a->instrlen, replacement, a->replacementlen);
DUMP_BYTES(ALT, instr, a->instrlen, "%px: old_insn: ", instr);
DUMP_BYTES(ALT, replacement, a->replacementlen, "%px: rpl_insn: ", replacement);
@@ -761,7 +760,7 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end)
len = patch_retpoline(addr, &insn, bytes);
if (len == insn.length) {
- optimize_nops(bytes, len);
+ optimize_nops(addr, bytes, len);
DUMP_BYTES(RETPOLINE, ((u8*)addr), len, "%px: orig: ", addr);
DUMP_BYTES(RETPOLINE, ((u8*)bytes), len, "%px: repl: ", addr);
text_poke_early(addr, bytes, len);
@@ -902,8 +901,8 @@ void __init_or_module apply_seal_endbr(s32 *start, s32 *end) { }
#endif /* CONFIG_X86_KERNEL_IBT */
-#ifdef CONFIG_FINEIBT
-#define __CFI_DEFAULT CFI_DEFAULT
+#ifdef CONFIG_CFI_AUTO_DEFAULT
+#define __CFI_DEFAULT CFI_AUTO
#elif defined(CONFIG_CFI_CLANG)
#define __CFI_DEFAULT CFI_KCFI
#else
@@ -1011,7 +1010,7 @@ static __init int cfi_parse_cmdline(char *str)
}
if (!strcmp(str, "auto")) {
- cfi_mode = CFI_DEFAULT;
+ cfi_mode = CFI_AUTO;
} else if (!strcmp(str, "off")) {
cfi_mode = CFI_OFF;
cfi_rand = false;
@@ -1271,7 +1270,7 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
"FineIBT preamble wrong size: %ld", fineibt_preamble_size))
return;
- if (cfi_mode == CFI_DEFAULT) {
+ if (cfi_mode == CFI_AUTO) {
cfi_mode = CFI_KCFI;
if (HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT))
cfi_mode = CFI_FINEIBT;
@@ -1658,7 +1657,7 @@ static noinline void __init alt_reloc_selftest(void)
*/
asm_inline volatile (
ALTERNATIVE("", "lea %[mem], %%" _ASM_ARG1 "; call __alt_reloc_selftest;", X86_FEATURE_ALWAYS)
- : /* output */
+ : ASM_CALL_CONSTRAINT
: [mem] "m" (__alt_reloc_selftest_addr)
: _ASM_ARG1
);
diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c
index 2ae98f754e59..c884deca839b 100644
--- a/arch/x86/kernel/amd_gart_64.c
+++ b/arch/x86/kernel/amd_gart_64.c
@@ -676,7 +676,7 @@ static const struct dma_map_ops gart_dma_ops = {
.get_sgtable = dma_common_get_sgtable,
.dma_supported = dma_direct_supported,
.get_required_mask = dma_direct_get_required_mask,
- .alloc_pages = dma_direct_alloc_pages,
+ .alloc_pages_op = dma_direct_alloc_pages,
.free_pages = dma_direct_free_pages,
};
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index 5bf5f9fc5753..dc5d3216af24 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -26,6 +26,7 @@
#define PCI_DEVICE_ID_AMD_19H_M70H_ROOT 0x14e8
#define PCI_DEVICE_ID_AMD_1AH_M00H_ROOT 0x153a
#define PCI_DEVICE_ID_AMD_1AH_M20H_ROOT 0x1507
+#define PCI_DEVICE_ID_AMD_1AH_M60H_ROOT 0x1122
#define PCI_DEVICE_ID_AMD_MI200_ROOT 0x14bb
#define PCI_DEVICE_ID_AMD_MI300_ROOT 0x14f8
@@ -43,6 +44,8 @@
#define PCI_DEVICE_ID_AMD_19H_M70H_DF_F4 0x14f4
#define PCI_DEVICE_ID_AMD_19H_M78H_DF_F4 0x12fc
#define PCI_DEVICE_ID_AMD_1AH_M00H_DF_F4 0x12c4
+#define PCI_DEVICE_ID_AMD_1AH_M60H_DF_F4 0x124c
+#define PCI_DEVICE_ID_AMD_1AH_M70H_DF_F4 0x12bc
#define PCI_DEVICE_ID_AMD_MI200_DF_F4 0x14d4
#define PCI_DEVICE_ID_AMD_MI300_DF_F4 0x152c
@@ -63,6 +66,7 @@ static const struct pci_device_id amd_root_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M70H_ROOT) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M00H_ROOT) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M20H_ROOT) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M60H_ROOT) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_ROOT) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI300_ROOT) },
{}
@@ -95,6 +99,8 @@ static const struct pci_device_id amd_nb_misc_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M78H_DF_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M00H_DF_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M20H_DF_F3) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M60H_DF_F3) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M70H_DF_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_DF_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI300_DF_F3) },
{}
@@ -121,6 +127,8 @@ static const struct pci_device_id amd_nb_link_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M78H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M00H_DF_F4) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M60H_DF_F4) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M70H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI300_DF_F4) },
{}
@@ -179,6 +187,43 @@ static struct pci_dev *next_northbridge(struct pci_dev *dev,
return dev;
}
+/*
+ * SMN accesses may fail in ways that are difficult to detect here in the called
+ * functions amd_smn_read() and amd_smn_write(). Therefore, callers must do
+ * their own checking based on what behavior they expect.
+ *
+ * For SMN reads, the returned value may be zero if the register is Read-as-Zero.
+ * Or it may be a "PCI Error Response", e.g. all 0xFFs. The "PCI Error Response"
+ * can be checked here, and a proper error code can be returned.
+ *
+ * But the Read-as-Zero response cannot be verified here. A value of 0 may be
+ * correct in some cases, so callers must check that this correct is for the
+ * register/fields they need.
+ *
+ * For SMN writes, success can be determined through a "write and read back"
+ * However, this is not robust when done here.
+ *
+ * Possible issues:
+ *
+ * 1) Bits that are "Write-1-to-Clear". In this case, the read value should
+ * *not* match the write value.
+ *
+ * 2) Bits that are "Read-as-Zero"/"Writes-Ignored". This information cannot be
+ * known here.
+ *
+ * 3) Bits that are "Reserved / Set to 1". Ditto above.
+ *
+ * Callers of amd_smn_write() should do the "write and read back" check
+ * themselves, if needed.
+ *
+ * For #1, they can see if their target bits got cleared.
+ *
+ * For #2 and #3, they can check if their target bits got set as intended.
+ *
+ * This matches what is done for RDMSR/WRMSR. As long as there's no #GP, then
+ * the operation is considered a success, and the caller does their own
+ * checking.
+ */
static int __amd_smn_rw(u16 node, u32 address, u32 *value, bool write)
{
struct pci_dev *root;
@@ -201,9 +246,6 @@ static int __amd_smn_rw(u16 node, u32 address, u32 *value, bool write)
err = (write ? pci_write_config_dword(root, 0x64, *value)
: pci_read_config_dword(root, 0x64, value));
- if (err)
- pr_warn("Error %s SMN address 0x%x.\n",
- (write ? "writing to" : "reading from"), address);
out_unlock:
mutex_unlock(&smn_mutex);
@@ -212,13 +254,20 @@ out:
return err;
}
-int amd_smn_read(u16 node, u32 address, u32 *value)
+int __must_check amd_smn_read(u16 node, u32 address, u32 *value)
{
- return __amd_smn_rw(node, address, value, false);
+ int err = __amd_smn_rw(node, address, value, false);
+
+ if (PCI_POSSIBLE_ERROR(*value)) {
+ err = -ENODEV;
+ *value = 0;
+ }
+
+ return err;
}
EXPORT_SYMBOL_GPL(amd_smn_read);
-int amd_smn_write(u16 node, u32 address, u32 value)
+int __must_check amd_smn_write(u16 node, u32 address, u32 value)
{
return __amd_smn_rw(node, address, &value, true);
}
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 803dcfb0e346..6513c53c9459 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -497,32 +497,32 @@ static struct clock_event_device lapic_clockevent = {
static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
static const struct x86_cpu_id deadline_match[] __initconst = {
- X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(HASWELL_X, X86_STEPPINGS(0x2, 0x2), 0x3a), /* EP */
- X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(HASWELL_X, X86_STEPPINGS(0x4, 0x4), 0x0f), /* EX */
+ X86_MATCH_VFM_STEPPINGS(INTEL_HASWELL_X, X86_STEPPINGS(0x2, 0x2), 0x3a), /* EP */
+ X86_MATCH_VFM_STEPPINGS(INTEL_HASWELL_X, X86_STEPPINGS(0x4, 0x4), 0x0f), /* EX */
- X86_MATCH_INTEL_FAM6_MODEL( BROADWELL_X, 0x0b000020),
+ X86_MATCH_VFM(INTEL_BROADWELL_X, 0x0b000020),
- X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(BROADWELL_D, X86_STEPPINGS(0x2, 0x2), 0x00000011),
- X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(BROADWELL_D, X86_STEPPINGS(0x3, 0x3), 0x0700000e),
- X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(BROADWELL_D, X86_STEPPINGS(0x4, 0x4), 0x0f00000c),
- X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(BROADWELL_D, X86_STEPPINGS(0x5, 0x5), 0x0e000003),
+ X86_MATCH_VFM_STEPPINGS(INTEL_BROADWELL_D, X86_STEPPINGS(0x2, 0x2), 0x00000011),
+ X86_MATCH_VFM_STEPPINGS(INTEL_BROADWELL_D, X86_STEPPINGS(0x3, 0x3), 0x0700000e),
+ X86_MATCH_VFM_STEPPINGS(INTEL_BROADWELL_D, X86_STEPPINGS(0x4, 0x4), 0x0f00000c),
+ X86_MATCH_VFM_STEPPINGS(INTEL_BROADWELL_D, X86_STEPPINGS(0x5, 0x5), 0x0e000003),
- X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(SKYLAKE_X, X86_STEPPINGS(0x3, 0x3), 0x01000136),
- X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(SKYLAKE_X, X86_STEPPINGS(0x4, 0x4), 0x02000014),
- X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(SKYLAKE_X, X86_STEPPINGS(0x5, 0xf), 0),
+ X86_MATCH_VFM_STEPPINGS(INTEL_SKYLAKE_X, X86_STEPPINGS(0x3, 0x3), 0x01000136),
+ X86_MATCH_VFM_STEPPINGS(INTEL_SKYLAKE_X, X86_STEPPINGS(0x4, 0x4), 0x02000014),
+ X86_MATCH_VFM_STEPPINGS(INTEL_SKYLAKE_X, X86_STEPPINGS(0x5, 0xf), 0),
- X86_MATCH_INTEL_FAM6_MODEL( HASWELL, 0x22),
- X86_MATCH_INTEL_FAM6_MODEL( HASWELL_L, 0x20),
- X86_MATCH_INTEL_FAM6_MODEL( HASWELL_G, 0x17),
+ X86_MATCH_VFM(INTEL_HASWELL, 0x22),
+ X86_MATCH_VFM(INTEL_HASWELL_L, 0x20),
+ X86_MATCH_VFM(INTEL_HASWELL_G, 0x17),
- X86_MATCH_INTEL_FAM6_MODEL( BROADWELL, 0x25),
- X86_MATCH_INTEL_FAM6_MODEL( BROADWELL_G, 0x17),
+ X86_MATCH_VFM(INTEL_BROADWELL, 0x25),
+ X86_MATCH_VFM(INTEL_BROADWELL_G, 0x17),
- X86_MATCH_INTEL_FAM6_MODEL( SKYLAKE_L, 0xb2),
- X86_MATCH_INTEL_FAM6_MODEL( SKYLAKE, 0xb2),
+ X86_MATCH_VFM(INTEL_SKYLAKE_L, 0xb2),
+ X86_MATCH_VFM(INTEL_SKYLAKE, 0xb2),
- X86_MATCH_INTEL_FAM6_MODEL( KABYLAKE_L, 0x52),
- X86_MATCH_INTEL_FAM6_MODEL( KABYLAKE, 0x52),
+ X86_MATCH_VFM(INTEL_KABYLAKE_L, 0x52),
+ X86_MATCH_VFM(INTEL_KABYLAKE, 0x52),
{},
};
@@ -631,7 +631,7 @@ void lapic_update_tsc_freq(void)
static __initdata int lapic_cal_loops = -1;
static __initdata long lapic_cal_t1, lapic_cal_t2;
static __initdata unsigned long long lapic_cal_tsc1, lapic_cal_tsc2;
-static __initdata unsigned long lapic_cal_pm1, lapic_cal_pm2;
+static __initdata u32 lapic_cal_pm1, lapic_cal_pm2;
static __initdata unsigned long lapic_cal_j1, lapic_cal_j2;
/*
@@ -641,7 +641,7 @@ static void __init lapic_cal_handler(struct clock_event_device *dev)
{
unsigned long long tsc = 0;
long tapic = apic_read(APIC_TMCCT);
- unsigned long pm = acpi_pm_read_early();
+ u32 pm = acpi_pm_read_early();
if (boot_cpu_has(X86_FEATURE_TSC))
tsc = rdtsc();
@@ -666,7 +666,7 @@ static void __init lapic_cal_handler(struct clock_event_device *dev)
}
static int __init
-calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc)
+calibrate_by_pmtimer(u32 deltapm, long *delta, long *deltatsc)
{
const long pm_100ms = PMTMR_TICKS_PER_SEC / 10;
const long pm_thresh = pm_100ms / 100;
@@ -677,7 +677,7 @@ calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc)
return -1;
#endif
- apic_printk(APIC_VERBOSE, "... PM-Timer delta = %ld\n", deltapm);
+ apic_pr_verbose("... PM-Timer delta = %u\n", deltapm);
/* Check, if the PM timer is available */
if (!deltapm)
@@ -687,14 +687,14 @@ calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc)
if (deltapm > (pm_100ms - pm_thresh) &&
deltapm < (pm_100ms + pm_thresh)) {
- apic_printk(APIC_VERBOSE, "... PM-Timer result ok\n");
+ apic_pr_verbose("... PM-Timer result ok\n");
return 0;
}
res = (((u64)deltapm) * mult) >> 22;
do_div(res, 1000000);
- pr_warn("APIC calibration not consistent "
- "with PM-Timer: %ldms instead of 100ms\n", (long)res);
+ pr_warn("APIC calibration not consistent with PM-Timer: %ldms instead of 100ms\n",
+ (long)res);
/* Correct the lapic counter value */
res = (((u64)(*delta)) * pm_100ms);
@@ -707,9 +707,8 @@ calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc)
if (boot_cpu_has(X86_FEATURE_TSC)) {
res = (((u64)(*deltatsc)) * pm_100ms);
do_div(res, deltapm);
- apic_printk(APIC_VERBOSE, "TSC delta adjusted to "
- "PM-Timer: %lu (%ld)\n",
- (unsigned long)res, *deltatsc);
+ apic_pr_verbose("TSC delta adjusted to PM-Timer: %lu (%ld)\n",
+ (unsigned long)res, *deltatsc);
*deltatsc = (long)res;
}
@@ -792,8 +791,7 @@ static int __init calibrate_APIC_clock(void)
* in the clockevent structure and return.
*/
if (!lapic_init_clockevent()) {
- apic_printk(APIC_VERBOSE, "lapic timer already calibrated %d\n",
- lapic_timer_period);
+ apic_pr_verbose("lapic timer already calibrated %d\n", lapic_timer_period);
/*
* Direct calibration methods must have an always running
* local APIC timer, no need for broadcast timer.
@@ -802,8 +800,7 @@ static int __init calibrate_APIC_clock(void)
return 0;
}
- apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"
- "calibrating APIC timer ...\n");
+ apic_pr_verbose("Using local APIC timer interrupts. Calibrating APIC timer ...\n");
/*
* There are platforms w/o global clockevent devices. Instead of
@@ -866,7 +863,7 @@ static int __init calibrate_APIC_clock(void)
/* Build delta t1-t2 as apic timer counts down */
delta = lapic_cal_t1 - lapic_cal_t2;
- apic_printk(APIC_VERBOSE, "... lapic delta = %ld\n", delta);
+ apic_pr_verbose("... lapic delta = %ld\n", delta);
deltatsc = (long)(lapic_cal_tsc2 - lapic_cal_tsc1);
@@ -877,22 +874,19 @@ static int __init calibrate_APIC_clock(void)
lapic_timer_period = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS;
lapic_init_clockevent();
- apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta);
- apic_printk(APIC_VERBOSE, "..... mult: %u\n", lapic_clockevent.mult);
- apic_printk(APIC_VERBOSE, "..... calibration result: %u\n",
- lapic_timer_period);
+ apic_pr_verbose("..... delta %ld\n", delta);
+ apic_pr_verbose("..... mult: %u\n", lapic_clockevent.mult);
+ apic_pr_verbose("..... calibration result: %u\n", lapic_timer_period);
if (boot_cpu_has(X86_FEATURE_TSC)) {
- apic_printk(APIC_VERBOSE, "..... CPU clock speed is "
- "%ld.%04ld MHz.\n",
- (deltatsc / LAPIC_CAL_LOOPS) / (1000000 / HZ),
- (deltatsc / LAPIC_CAL_LOOPS) % (1000000 / HZ));
+ apic_pr_verbose("..... CPU clock speed is %ld.%04ld MHz.\n",
+ (deltatsc / LAPIC_CAL_LOOPS) / (1000000 / HZ),
+ (deltatsc / LAPIC_CAL_LOOPS) % (1000000 / HZ));
}
- apic_printk(APIC_VERBOSE, "..... host bus clock speed is "
- "%u.%04u MHz.\n",
- lapic_timer_period / (1000000 / HZ),
- lapic_timer_period % (1000000 / HZ));
+ apic_pr_verbose("..... host bus clock speed is %u.%04u MHz.\n",
+ lapic_timer_period / (1000000 / HZ),
+ lapic_timer_period % (1000000 / HZ));
/*
* Do a sanity check on the APIC calibration result
@@ -911,7 +905,7 @@ static int __init calibrate_APIC_clock(void)
* available.
*/
if (!pm_referenced && global_clock_event) {
- apic_printk(APIC_VERBOSE, "... verify APIC timer\n");
+ apic_pr_verbose("... verify APIC timer\n");
/*
* Setup the apic timer manually
@@ -932,11 +926,11 @@ static int __init calibrate_APIC_clock(void)
/* Jiffies delta */
deltaj = lapic_cal_j2 - lapic_cal_j1;
- apic_printk(APIC_VERBOSE, "... jiffies delta = %lu\n", deltaj);
+ apic_pr_verbose("... jiffies delta = %lu\n", deltaj);
/* Check, if the jiffies result is consistent */
if (deltaj >= LAPIC_CAL_LOOPS-2 && deltaj <= LAPIC_CAL_LOOPS+2)
- apic_printk(APIC_VERBOSE, "... jiffies result ok\n");
+ apic_pr_verbose("... jiffies result ok\n");
else
levt->features |= CLOCK_EVT_FEAT_DUMMY;
}
@@ -1221,9 +1215,8 @@ void __init sync_Arb_IDs(void)
*/
apic_wait_icr_idle();
- apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n");
- apic_write(APIC_ICR, APIC_DEST_ALLINC |
- APIC_INT_LEVELTRIG | APIC_DM_INIT);
+ apic_pr_debug("Synchronizing Arb IDs.\n");
+ apic_write(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG | APIC_DM_INIT);
}
enum apic_intr_mode_id apic_intr_mode __ro_after_init;
@@ -1409,10 +1402,10 @@ static void lapic_setup_esr(void)
if (maxlvt > 3)
apic_write(APIC_ESR, 0);
value = apic_read(APIC_ESR);
- if (value != oldvalue)
- apic_printk(APIC_VERBOSE, "ESR value before enabling "
- "vector: 0x%08x after: 0x%08x\n",
- oldvalue, value);
+ if (value != oldvalue) {
+ apic_pr_verbose("ESR value before enabling vector: 0x%08x after: 0x%08x\n",
+ oldvalue, value);
+ }
}
#define APIC_IR_REGS APIC_ISR_NR
@@ -1599,10 +1592,10 @@ static void setup_local_APIC(void)
value = apic_read(APIC_LVT0) & APIC_LVT_MASKED;
if (!cpu && (pic_mode || !value || ioapic_is_disabled)) {
value = APIC_DM_EXTINT;
- apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n", cpu);
+ apic_pr_verbose("Enabled ExtINT on CPU#%d\n", cpu);
} else {
value = APIC_DM_EXTINT | APIC_LVT_MASKED;
- apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n", cpu);
+ apic_pr_verbose("Masked ExtINT on CPU#%d\n", cpu);
}
apic_write(APIC_LVT0, value);
@@ -1775,12 +1768,9 @@ static __init void apic_set_fixmap(bool read_apic);
static __init void x2apic_disable(void)
{
- u32 x2apic_id, state = x2apic_state;
-
- x2apic_mode = 0;
- x2apic_state = X2APIC_DISABLED;
+ u32 x2apic_id;
- if (state != X2APIC_ON)
+ if (x2apic_state < X2APIC_ON)
return;
x2apic_id = read_apic_id();
@@ -1793,6 +1783,10 @@ static __init void x2apic_disable(void)
}
__x2apic_disable();
+
+ x2apic_mode = 0;
+ x2apic_state = X2APIC_DISABLED;
+
/*
* Don't reread the APIC ID as it was already done from
* check_x2apic() and the APIC driver still is a x2APIC variant,
@@ -2066,8 +2060,7 @@ static __init void apic_set_fixmap(bool read_apic)
{
set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
apic_mmio_base = APIC_BASE;
- apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n",
- apic_mmio_base, mp_lapic_addr);
+ apic_pr_verbose("Mapped APIC to %16lx (%16lx)\n", apic_mmio_base, mp_lapic_addr);
if (read_apic)
apic_read_boot_cpu_id(false);
}
@@ -2170,18 +2163,17 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_error_interrupt)
apic_eoi();
atomic_inc(&irq_err_count);
- apic_printk(APIC_DEBUG, KERN_DEBUG "APIC error on CPU%d: %02x",
- smp_processor_id(), v);
+ apic_pr_debug("APIC error on CPU%d: %02x", smp_processor_id(), v);
v &= 0xff;
while (v) {
if (v & 0x1)
- apic_printk(APIC_DEBUG, KERN_CONT " : %s", error_interrupt_reason[i]);
+ apic_pr_debug_cont(" : %s", error_interrupt_reason[i]);
i++;
v >>= 1;
}
- apic_printk(APIC_DEBUG, KERN_CONT "\n");
+ apic_pr_debug_cont("\n");
trace_error_apic_exit(ERROR_APIC_VECTOR);
}
@@ -2201,8 +2193,7 @@ static void __init connect_bsp_APIC(void)
* PIC mode, enable APIC mode in the IMCR, i.e. connect BSP's
* local APIC to INT and NMI lines.
*/
- apic_printk(APIC_VERBOSE, "leaving PIC mode, "
- "enabling APIC mode.\n");
+ apic_pr_verbose("Leaving PIC mode, enabling APIC mode.\n");
imcr_pic_to_apic();
}
#endif
@@ -2227,8 +2218,7 @@ void disconnect_bsp_APIC(int virt_wire_setup)
* IPIs, won't work beyond this point! The only exception are
* INIT IPIs.
*/
- apic_printk(APIC_VERBOSE, "disabling APIC mode, "
- "entering PIC mode.\n");
+ apic_pr_verbose("Disabling APIC mode, entering PIC mode.\n");
imcr_apic_to_pic();
return;
}
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index f37ad3392fec..e0308d8c4e6c 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -8,129 +8,25 @@
* Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
* James Cleverdon.
*/
-#include <linux/cpumask.h>
#include <linux/export.h>
-#include <linux/acpi.h>
-#include <asm/jailhouse_para.h>
#include <asm/apic.h>
#include "local.h"
-static struct apic apic_physflat;
-static struct apic apic_flat;
-
-struct apic *apic __ro_after_init = &apic_flat;
-EXPORT_SYMBOL_GPL(apic);
-
-static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
-{
- return 1;
-}
-
-static void _flat_send_IPI_mask(unsigned long mask, int vector)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- __default_send_IPI_dest_field(mask, vector, APIC_DEST_LOGICAL);
- local_irq_restore(flags);
-}
-
-static void flat_send_IPI_mask(const struct cpumask *cpumask, int vector)
-{
- unsigned long mask = cpumask_bits(cpumask)[0];
-
- _flat_send_IPI_mask(mask, vector);
-}
-
-static void
-flat_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector)
-{
- unsigned long mask = cpumask_bits(cpumask)[0];
- int cpu = smp_processor_id();
-
- if (cpu < BITS_PER_LONG)
- __clear_bit(cpu, &mask);
-
- _flat_send_IPI_mask(mask, vector);
-}
-
-static u32 flat_get_apic_id(u32 x)
+static u32 physflat_get_apic_id(u32 x)
{
return (x >> 24) & 0xFF;
}
-static int flat_probe(void)
+static int physflat_probe(void)
{
return 1;
}
-static struct apic apic_flat __ro_after_init = {
- .name = "flat",
- .probe = flat_probe,
- .acpi_madt_oem_check = flat_acpi_madt_oem_check,
-
- .dest_mode_logical = true,
-
- .disable_esr = 0,
-
- .init_apic_ldr = default_init_apic_ldr,
- .cpu_present_to_apicid = default_cpu_present_to_apicid,
-
- .max_apic_id = 0xFE,
- .get_apic_id = flat_get_apic_id,
-
- .calc_dest_apicid = apic_flat_calc_apicid,
-
- .send_IPI = default_send_IPI_single,
- .send_IPI_mask = flat_send_IPI_mask,
- .send_IPI_mask_allbutself = flat_send_IPI_mask_allbutself,
- .send_IPI_allbutself = default_send_IPI_allbutself,
- .send_IPI_all = default_send_IPI_all,
- .send_IPI_self = default_send_IPI_self,
- .nmi_to_offline_cpu = true,
-
- .read = native_apic_mem_read,
- .write = native_apic_mem_write,
- .eoi = native_apic_mem_eoi,
- .icr_read = native_apic_icr_read,
- .icr_write = native_apic_icr_write,
- .wait_icr_idle = apic_mem_wait_icr_idle,
- .safe_wait_icr_idle = apic_mem_wait_icr_idle_timeout,
-};
-
-/*
- * Physflat mode is used when there are more than 8 CPUs on a system.
- * We cannot use logical delivery in this case because the mask
- * overflows, so use physical mode.
- */
static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
{
-#ifdef CONFIG_ACPI
- /*
- * Quirk: some x86_64 machines can only use physical APIC mode
- * regardless of how many processors are present (x86_64 ES7000
- * is an example).
- */
- if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID &&
- (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) {
- printk(KERN_DEBUG "system APIC only can use physical flat");
- return 1;
- }
-
- if (!strncmp(oem_id, "IBM", 3) && !strncmp(oem_table_id, "EXA", 3)) {
- printk(KERN_DEBUG "IBM Summit detected, will use apic physical");
- return 1;
- }
-#endif
-
- return 0;
-}
-
-static int physflat_probe(void)
-{
- return apic == &apic_physflat || num_possible_cpus() > 8 || jailhouse_paravirt();
+ return 1;
}
static struct apic apic_physflat __ro_after_init = {
@@ -146,7 +42,7 @@ static struct apic apic_physflat __ro_after_init = {
.cpu_present_to_apicid = default_cpu_present_to_apicid,
.max_apic_id = 0xFE,
- .get_apic_id = flat_get_apic_id,
+ .get_apic_id = physflat_get_apic_id,
.calc_dest_apicid = apic_default_calc_apicid,
@@ -166,8 +62,7 @@ static struct apic apic_physflat __ro_after_init = {
.wait_icr_idle = apic_mem_wait_icr_idle,
.safe_wait_icr_idle = apic_mem_wait_icr_idle_timeout,
};
+apic_driver(apic_physflat);
-/*
- * We need to check for physflat first, so this order is important.
- */
-apic_drivers(apic_physflat, apic_flat);
+struct apic *apic __ro_after_init = &apic_physflat;
+EXPORT_SYMBOL_GPL(apic);
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 477b740b2f26..1029ea4ac8ba 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -86,8 +86,8 @@ static unsigned int ioapic_dynirq_base;
static int ioapic_initialized;
struct irq_pin_list {
- struct list_head list;
- int apic, pin;
+ struct list_head list;
+ int apic, pin;
};
struct mp_chip_data {
@@ -96,7 +96,7 @@ struct mp_chip_data {
bool is_level;
bool active_low;
bool isa_irq;
- u32 count;
+ u32 count;
};
struct mp_ioapic_gsi {
@@ -105,21 +105,17 @@ struct mp_ioapic_gsi {
};
static struct ioapic {
- /*
- * # of IRQ routing registers
- */
- int nr_registers;
- /*
- * Saved state during suspend/resume, or while enabling intr-remap.
- */
- struct IO_APIC_route_entry *saved_registers;
+ /* # of IRQ routing registers */
+ int nr_registers;
+ /* Saved state during suspend/resume, or while enabling intr-remap. */
+ struct IO_APIC_route_entry *saved_registers;
/* I/O APIC config */
- struct mpc_ioapic mp_config;
+ struct mpc_ioapic mp_config;
/* IO APIC gsi routing info */
- struct mp_ioapic_gsi gsi_config;
- struct ioapic_domain_cfg irqdomain_cfg;
- struct irq_domain *irqdomain;
- struct resource *iomem_res;
+ struct mp_ioapic_gsi gsi_config;
+ struct ioapic_domain_cfg irqdomain_cfg;
+ struct irq_domain *irqdomain;
+ struct resource *iomem_res;
} ioapics[MAX_IO_APICS];
#define mpc_ioapic_ver(ioapic_idx) ioapics[ioapic_idx].mp_config.apicver
@@ -205,10 +201,9 @@ void mp_save_irq(struct mpc_intsrc *m)
{
int i;
- apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
- " IRQ %02x, APIC ID %x, APIC INT %02x\n",
- m->irqtype, m->irqflag & 3, (m->irqflag >> 2) & 3, m->srcbus,
- m->srcbusirq, m->dstapic, m->dstirq);
+ apic_pr_verbose("Int: type %d, pol %d, trig %d, bus %02x, IRQ %02x, APIC ID %x, APIC INT %02x\n",
+ m->irqtype, m->irqflag & 3, (m->irqflag >> 2) & 3, m->srcbus,
+ m->srcbusirq, m->dstapic, m->dstirq);
for (i = 0; i < mp_irq_entries; i++) {
if (!memcmp(&mp_irqs[i], m, sizeof(*m)))
@@ -269,12 +264,14 @@ static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
static inline void io_apic_eoi(unsigned int apic, unsigned int vector)
{
struct io_apic __iomem *io_apic = io_apic_base(apic);
+
writel(vector, &io_apic->eoi);
}
unsigned int native_io_apic_read(unsigned int apic, unsigned int reg)
{
struct io_apic __iomem *io_apic = io_apic_base(apic);
+
writel(reg, &io_apic->index);
return readl(&io_apic->data);
}
@@ -300,14 +297,8 @@ static struct IO_APIC_route_entry __ioapic_read_entry(int apic, int pin)
static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
{
- struct IO_APIC_route_entry entry;
- unsigned long flags;
-
- raw_spin_lock_irqsave(&ioapic_lock, flags);
- entry = __ioapic_read_entry(apic, pin);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
-
- return entry;
+ guard(raw_spinlock_irqsave)(&ioapic_lock);
+ return __ioapic_read_entry(apic, pin);
}
/*
@@ -324,11 +315,8 @@ static void __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e
static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
{
- unsigned long flags;
-
- raw_spin_lock_irqsave(&ioapic_lock, flags);
+ guard(raw_spinlock_irqsave)(&ioapic_lock);
__ioapic_write_entry(apic, pin, e);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
}
/*
@@ -339,12 +327,10 @@ static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
static void ioapic_mask_entry(int apic, int pin)
{
struct IO_APIC_route_entry e = { .masked = true };
- unsigned long flags;
- raw_spin_lock_irqsave(&ioapic_lock, flags);
+ guard(raw_spinlock_irqsave)(&ioapic_lock);
io_apic_write(apic, 0x10 + 2*pin, e.w1);
io_apic_write(apic, 0x11 + 2*pin, e.w2);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
}
/*
@@ -352,68 +338,39 @@ static void ioapic_mask_entry(int apic, int pin)
* shared ISA-space IRQs, so we have to support them. We are super
* fast in the common case, and fast for shared ISA-space IRQs.
*/
-static int __add_pin_to_irq_node(struct mp_chip_data *data,
- int node, int apic, int pin)
+static bool add_pin_to_irq_node(struct mp_chip_data *data, int node, int apic, int pin)
{
struct irq_pin_list *entry;
- /* don't allow duplicates */
- for_each_irq_pin(entry, data->irq_2_pin)
+ /* Don't allow duplicates */
+ for_each_irq_pin(entry, data->irq_2_pin) {
if (entry->apic == apic && entry->pin == pin)
- return 0;
+ return true;
+ }
entry = kzalloc_node(sizeof(struct irq_pin_list), GFP_ATOMIC, node);
if (!entry) {
- pr_err("can not alloc irq_pin_list (%d,%d,%d)\n",
- node, apic, pin);
- return -ENOMEM;
+ pr_err("Cannot allocate irq_pin_list (%d,%d,%d)\n", node, apic, pin);
+ return false;
}
+
entry->apic = apic;
entry->pin = pin;
list_add_tail(&entry->list, &data->irq_2_pin);
-
- return 0;
+ return true;
}
static void __remove_pin_from_irq(struct mp_chip_data *data, int apic, int pin)
{
struct irq_pin_list *tmp, *entry;
- list_for_each_entry_safe(entry, tmp, &data->irq_2_pin, list)
+ list_for_each_entry_safe(entry, tmp, &data->irq_2_pin, list) {
if (entry->apic == apic && entry->pin == pin) {
list_del(&entry->list);
kfree(entry);
return;
}
-}
-
-static void add_pin_to_irq_node(struct mp_chip_data *data,
- int node, int apic, int pin)
-{
- if (__add_pin_to_irq_node(data, node, apic, pin))
- panic("IO-APIC: failed to add irq-pin. Can not proceed\n");
-}
-
-/*
- * Reroute an IRQ to a different pin.
- */
-static void __init replace_pin_at_irq_node(struct mp_chip_data *data, int node,
- int oldapic, int oldpin,
- int newapic, int newpin)
-{
- struct irq_pin_list *entry;
-
- for_each_irq_pin(entry, data->irq_2_pin) {
- if (entry->apic == oldapic && entry->pin == oldpin) {
- entry->apic = newapic;
- entry->pin = newpin;
- /* every one is different, right? */
- return;
- }
}
-
- /* old apic/pin didn't exist, so just add new ones */
- add_pin_to_irq_node(data, node, newapic, newpin);
}
static void io_apic_modify_irq(struct mp_chip_data *data, bool masked,
@@ -430,12 +387,12 @@ static void io_apic_modify_irq(struct mp_chip_data *data, bool masked,
}
}
+/*
+ * Synchronize the IO-APIC and the CPU by doing a dummy read from the
+ * IO-APIC
+ */
static void io_apic_sync(struct irq_pin_list *entry)
{
- /*
- * Synchronize the IO-APIC and the CPU by doing
- * a dummy read from the IO-APIC
- */
struct io_apic __iomem *io_apic;
io_apic = io_apic_base(entry->apic);
@@ -445,11 +402,9 @@ static void io_apic_sync(struct irq_pin_list *entry)
static void mask_ioapic_irq(struct irq_data *irq_data)
{
struct mp_chip_data *data = irq_data->chip_data;
- unsigned long flags;
- raw_spin_lock_irqsave(&ioapic_lock, flags);
+ guard(raw_spinlock_irqsave)(&ioapic_lock);
io_apic_modify_irq(data, true, &io_apic_sync);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
}
static void __unmask_ioapic(struct mp_chip_data *data)
@@ -460,11 +415,9 @@ static void __unmask_ioapic(struct mp_chip_data *data)
static void unmask_ioapic_irq(struct irq_data *irq_data)
{
struct mp_chip_data *data = irq_data->chip_data;
- unsigned long flags;
- raw_spin_lock_irqsave(&ioapic_lock, flags);
+ guard(raw_spinlock_irqsave)(&ioapic_lock);
__unmask_ioapic(data);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
}
/*
@@ -492,30 +445,24 @@ static void __eoi_ioapic_pin(int apic, int pin, int vector)
entry = entry1 = __ioapic_read_entry(apic, pin);
- /*
- * Mask the entry and change the trigger mode to edge.
- */
+ /* Mask the entry and change the trigger mode to edge. */
entry1.masked = true;
entry1.is_level = false;
__ioapic_write_entry(apic, pin, entry1);
- /*
- * Restore the previous level triggered entry.
- */
+ /* Restore the previous level triggered entry. */
__ioapic_write_entry(apic, pin, entry);
}
}
static void eoi_ioapic_pin(int vector, struct mp_chip_data *data)
{
- unsigned long flags;
struct irq_pin_list *entry;
- raw_spin_lock_irqsave(&ioapic_lock, flags);
+ guard(raw_spinlock_irqsave)(&ioapic_lock);
for_each_irq_pin(entry, data->irq_2_pin)
__eoi_ioapic_pin(entry->apic, entry->pin, vector);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
}
static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
@@ -538,8 +485,6 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
}
if (entry.irr) {
- unsigned long flags;
-
/*
* Make sure the trigger mode is set to level. Explicit EOI
* doesn't clear the remote-IRR if the trigger mode is not
@@ -549,9 +494,8 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
entry.is_level = true;
ioapic_write_entry(apic, pin, entry);
}
- raw_spin_lock_irqsave(&ioapic_lock, flags);
+ guard(raw_spinlock_irqsave)(&ioapic_lock);
__eoi_ioapic_pin(apic, pin, entry.vector);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
}
/*
@@ -586,28 +530,23 @@ static int pirq_entries[MAX_PIRQS] = {
static int __init ioapic_pirq_setup(char *str)
{
- int i, max;
- int ints[MAX_PIRQS+1];
+ int i, max, ints[MAX_PIRQS+1];
get_options(str, ARRAY_SIZE(ints), ints);
- apic_printk(APIC_VERBOSE, KERN_INFO
- "PIRQ redirection, working around broken MP-BIOS.\n");
+ apic_pr_verbose("PIRQ redirection, working around broken MP-BIOS.\n");
+
max = MAX_PIRQS;
if (ints[0] < MAX_PIRQS)
max = ints[0];
for (i = 0; i < max; i++) {
- apic_printk(APIC_VERBOSE, KERN_DEBUG
- "... PIRQ%d -> IRQ %d\n", i, ints[i+1]);
- /*
- * PIRQs are mapped upside down, usually.
- */
+ apic_pr_verbose("... PIRQ%d -> IRQ %d\n", i, ints[i + 1]);
+ /* PIRQs are mapped upside down, usually */
pirq_entries[MAX_PIRQS-i-1] = ints[i+1];
}
return 1;
}
-
__setup("pirq=", ioapic_pirq_setup);
#endif /* CONFIG_X86_32 */
@@ -626,8 +565,7 @@ int save_ioapic_entries(void)
}
for_each_pin(apic, pin)
- ioapics[apic].saved_registers[pin] =
- ioapic_read_entry(apic, pin);
+ ioapics[apic].saved_registers[pin] = ioapic_read_entry(apic, pin);
}
return err;
@@ -668,8 +606,7 @@ int restore_ioapic_entries(void)
continue;
for_each_pin(apic, pin)
- ioapic_write_entry(apic, pin,
- ioapics[apic].saved_registers[pin]);
+ ioapic_write_entry(apic, pin, ioapics[apic].saved_registers[pin]);
}
return 0;
}
@@ -681,12 +618,13 @@ static int find_irq_entry(int ioapic_idx, int pin, int type)
{
int i;
- for (i = 0; i < mp_irq_entries; i++)
+ for (i = 0; i < mp_irq_entries; i++) {
if (mp_irqs[i].irqtype == type &&
(mp_irqs[i].dstapic == mpc_ioapic_id(ioapic_idx) ||
mp_irqs[i].dstapic == MP_APIC_ALL) &&
mp_irqs[i].dstirq == pin)
return i;
+ }
return -1;
}
@@ -701,10 +639,8 @@ static int __init find_isa_irq_pin(int irq, int type)
for (i = 0; i < mp_irq_entries; i++) {
int lbus = mp_irqs[i].srcbus;
- if (test_bit(lbus, mp_bus_not_pci) &&
- (mp_irqs[i].irqtype == type) &&
+ if (test_bit(lbus, mp_bus_not_pci) && (mp_irqs[i].irqtype == type) &&
(mp_irqs[i].srcbusirq == irq))
-
return mp_irqs[i].dstirq;
}
return -1;
@@ -717,8 +653,7 @@ static int __init find_isa_irq_apic(int irq, int type)
for (i = 0; i < mp_irq_entries; i++) {
int lbus = mp_irqs[i].srcbus;
- if (test_bit(lbus, mp_bus_not_pci) &&
- (mp_irqs[i].irqtype == type) &&
+ if (test_bit(lbus, mp_bus_not_pci) && (mp_irqs[i].irqtype == type) &&
(mp_irqs[i].srcbusirq == irq))
break;
}
@@ -726,9 +661,10 @@ static int __init find_isa_irq_apic(int irq, int type)
if (i < mp_irq_entries) {
int ioapic_idx;
- for_each_ioapic(ioapic_idx)
+ for_each_ioapic(ioapic_idx) {
if (mpc_ioapic_id(ioapic_idx) == mp_irqs[i].dstapic)
return ioapic_idx;
+ }
}
return -1;
@@ -769,8 +705,7 @@ static bool EISA_ELCR(unsigned int irq)
unsigned int port = PIC_ELCR1 + (irq >> 3);
return (inb(port) >> (irq & 7)) & 1;
}
- apic_printk(APIC_VERBOSE, KERN_INFO
- "Broken MPtable reports ISA irq %d\n", irq);
+ apic_pr_verbose("Broken MPtable reports ISA irq %d\n", irq);
return false;
}
@@ -947,9 +882,9 @@ static bool mp_check_pin_attr(int irq, struct irq_alloc_info *info)
static int alloc_irq_from_domain(struct irq_domain *domain, int ioapic, u32 gsi,
struct irq_alloc_info *info)
{
+ int type = ioapics[ioapic].irqdomain_cfg.type;
bool legacy = false;
int irq = -1;
- int type = ioapics[ioapic].irqdomain_cfg.type;
switch (type) {
case IOAPIC_DOMAIN_LEGACY:
@@ -971,8 +906,7 @@ static int alloc_irq_from_domain(struct irq_domain *domain, int ioapic, u32 gsi,
return -1;
}
- return __irq_domain_alloc_irqs(domain, irq, 1,
- ioapic_alloc_attr_node(info),
+ return __irq_domain_alloc_irqs(domain, irq, 1, ioapic_alloc_attr_node(info),
info, legacy, NULL);
}
@@ -986,13 +920,12 @@ static int alloc_irq_from_domain(struct irq_domain *domain, int ioapic, u32 gsi,
* PIRQs instead of reprogramming the interrupt routing logic. Thus there may be
* multiple pins sharing the same legacy IRQ number when ACPI is disabled.
*/
-static int alloc_isa_irq_from_domain(struct irq_domain *domain,
- int irq, int ioapic, int pin,
+static int alloc_isa_irq_from_domain(struct irq_domain *domain, int irq, int ioapic, int pin,
struct irq_alloc_info *info)
{
- struct mp_chip_data *data;
struct irq_data *irq_data = irq_get_irq_data(irq);
int node = ioapic_alloc_attr_node(info);
+ struct mp_chip_data *data;
/*
* Legacy ISA IRQ has already been allocated, just add pin to
@@ -1002,13 +935,11 @@ static int alloc_isa_irq_from_domain(struct irq_domain *domain,
if (irq_data && irq_data->parent_data) {
if (!mp_check_pin_attr(irq, info))
return -EBUSY;
- if (__add_pin_to_irq_node(irq_data->chip_data, node, ioapic,
- info->ioapic.pin))
+ if (!add_pin_to_irq_node(irq_data->chip_data, node, ioapic, info->ioapic.pin))
return -ENOMEM;
} else {
info->flags |= X86_IRQ_ALLOC_LEGACY;
- irq = __irq_domain_alloc_irqs(domain, irq, 1, node, info, true,
- NULL);
+ irq = __irq_domain_alloc_irqs(domain, irq, 1, node, info, true, NULL);
if (irq >= 0) {
irq_data = irq_domain_get_irq_data(domain, irq);
data = irq_data->chip_data;
@@ -1022,11 +953,11 @@ static int alloc_isa_irq_from_domain(struct irq_domain *domain,
static int mp_map_pin_to_irq(u32 gsi, int idx, int ioapic, int pin,
unsigned int flags, struct irq_alloc_info *info)
{
- int irq;
- bool legacy = false;
+ struct irq_domain *domain = mp_ioapic_irqdomain(ioapic);
struct irq_alloc_info tmp;
struct mp_chip_data *data;
- struct irq_domain *domain = mp_ioapic_irqdomain(ioapic);
+ bool legacy = false;
+ int irq;
if (!domain)
return -ENOSYS;
@@ -1046,7 +977,7 @@ static int mp_map_pin_to_irq(u32 gsi, int idx, int ioapic, int pin,
return -EINVAL;
}
- mutex_lock(&ioapic_mutex);
+ guard(mutex)(&ioapic_mutex);
if (!(flags & IOAPIC_MAP_ALLOC)) {
if (!legacy) {
irq = irq_find_mapping(domain, pin);
@@ -1067,8 +998,6 @@ static int mp_map_pin_to_irq(u32 gsi, int idx, int ioapic, int pin,
data->count++;
}
}
- mutex_unlock(&ioapic_mutex);
-
return irq;
}
@@ -1076,26 +1005,20 @@ static int pin_2_irq(int idx, int ioapic, int pin, unsigned int flags)
{
u32 gsi = mp_pin_to_gsi(ioapic, pin);
- /*
- * Debugging check, we are in big trouble if this message pops up!
- */
+ /* Debugging check, we are in big trouble if this message pops up! */
if (mp_irqs[idx].dstirq != pin)
pr_err("broken BIOS or MPTABLE parser, ayiee!!\n");
#ifdef CONFIG_X86_32
- /*
- * PCI IRQ command line redirection. Yes, limits are hardcoded.
- */
+ /* PCI IRQ command line redirection. Yes, limits are hardcoded. */
if ((pin >= 16) && (pin <= 23)) {
- if (pirq_entries[pin-16] != -1) {
- if (!pirq_entries[pin-16]) {
- apic_printk(APIC_VERBOSE, KERN_DEBUG
- "disabling PIRQ%d\n", pin-16);
+ if (pirq_entries[pin - 16] != -1) {
+ if (!pirq_entries[pin - 16]) {
+ apic_pr_verbose("Disabling PIRQ%d\n", pin - 16);
} else {
int irq = pirq_entries[pin-16];
- apic_printk(APIC_VERBOSE, KERN_DEBUG
- "using PIRQ%d -> IRQ %d\n",
- pin-16, irq);
+
+ apic_pr_verbose("Using PIRQ%d -> IRQ %d\n", pin - 16, irq);
return irq;
}
}
@@ -1133,10 +1056,9 @@ void mp_unmap_irq(int irq)
if (!data || data->isa_irq)
return;
- mutex_lock(&ioapic_mutex);
+ guard(mutex)(&ioapic_mutex);
if (--data->count == 0)
irq_domain_free_irqs(irq, 1);
- mutex_unlock(&ioapic_mutex);
}
/*
@@ -1147,12 +1069,10 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
{
int irq, i, best_ioapic = -1, best_idx = -1;
- apic_printk(APIC_DEBUG,
- "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
- bus, slot, pin);
+ apic_pr_debug("Querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
+ bus, slot, pin);
if (test_bit(bus, mp_bus_not_pci)) {
- apic_printk(APIC_VERBOSE,
- "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
+ apic_pr_verbose("PCI BIOS passed nonexistent PCI bus %d!\n", bus);
return -1;
}
@@ -1197,8 +1117,7 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
return -1;
out:
- return pin_2_irq(best_idx, best_ioapic, mp_irqs[best_idx].dstirq,
- IOAPIC_MAP_ALLOC);
+ return pin_2_irq(best_idx, best_ioapic, mp_irqs[best_idx].dstirq, IOAPIC_MAP_ALLOC);
}
EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
@@ -1209,17 +1128,16 @@ static void __init setup_IO_APIC_irqs(void)
unsigned int ioapic, pin;
int idx;
- apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
+ apic_pr_verbose("Init IO_APIC IRQs\n");
for_each_ioapic_pin(ioapic, pin) {
idx = find_irq_entry(ioapic, pin, mp_INT);
- if (idx < 0)
- apic_printk(APIC_VERBOSE,
- KERN_DEBUG " apic %d pin %d not connected\n",
- mpc_ioapic_id(ioapic), pin);
- else
- pin_2_irq(idx, ioapic, pin,
- ioapic ? 0 : IOAPIC_MAP_ALLOC);
+ if (idx < 0) {
+ apic_pr_verbose("apic %d pin %d not connected\n",
+ mpc_ioapic_id(ioapic), pin);
+ } else {
+ pin_2_irq(idx, ioapic, pin, ioapic ? 0 : IOAPIC_MAP_ALLOC);
+ }
}
}
@@ -1234,26 +1152,21 @@ static void io_apic_print_entries(unsigned int apic, unsigned int nr_entries)
char buf[256];
int i;
- printk(KERN_DEBUG "IOAPIC %d:\n", apic);
+ apic_dbg("IOAPIC %d:\n", apic);
for (i = 0; i <= nr_entries; i++) {
entry = ioapic_read_entry(apic, i);
- snprintf(buf, sizeof(buf),
- " pin%02x, %s, %s, %s, V(%02X), IRR(%1d), S(%1d)",
- i,
- entry.masked ? "disabled" : "enabled ",
+ snprintf(buf, sizeof(buf), " pin%02x, %s, %s, %s, V(%02X), IRR(%1d), S(%1d)",
+ i, entry.masked ? "disabled" : "enabled ",
entry.is_level ? "level" : "edge ",
entry.active_low ? "low " : "high",
entry.vector, entry.irr, entry.delivery_status);
if (entry.ir_format) {
- printk(KERN_DEBUG "%s, remapped, I(%04X), Z(%X)\n",
- buf,
- (entry.ir_index_15 << 15) | entry.ir_index_0_14,
- entry.ir_zero);
+ apic_dbg("%s, remapped, I(%04X), Z(%X)\n", buf,
+ (entry.ir_index_15 << 15) | entry.ir_index_0_14, entry.ir_zero);
} else {
- printk(KERN_DEBUG "%s, %s, D(%02X%02X), M(%1d)\n", buf,
- entry.dest_mode_logical ? "logical " : "physical",
- entry.virt_destid_8_14, entry.destid_0_7,
- entry.delivery_mode);
+ apic_dbg("%s, %s, D(%02X%02X), M(%1d)\n", buf,
+ entry.dest_mode_logical ? "logical " : "physic al",
+ entry.virt_destid_8_14, entry.destid_0_7, entry.delivery_mode);
}
}
}
@@ -1264,30 +1177,25 @@ static void __init print_IO_APIC(int ioapic_idx)
union IO_APIC_reg_01 reg_01;
union IO_APIC_reg_02 reg_02;
union IO_APIC_reg_03 reg_03;
- unsigned long flags;
- raw_spin_lock_irqsave(&ioapic_lock, flags);
- reg_00.raw = io_apic_read(ioapic_idx, 0);
- reg_01.raw = io_apic_read(ioapic_idx, 1);
- if (reg_01.bits.version >= 0x10)
- reg_02.raw = io_apic_read(ioapic_idx, 2);
- if (reg_01.bits.version >= 0x20)
- reg_03.raw = io_apic_read(ioapic_idx, 3);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
-
- printk(KERN_DEBUG "IO APIC #%d......\n", mpc_ioapic_id(ioapic_idx));
- printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
- printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
- printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
- printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS);
-
- printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)&reg_01);
- printk(KERN_DEBUG "....... : max redirection entries: %02X\n",
- reg_01.bits.entries);
-
- printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ);
- printk(KERN_DEBUG "....... : IO APIC version: %02X\n",
- reg_01.bits.version);
+ scoped_guard (raw_spinlock_irqsave, &ioapic_lock) {
+ reg_00.raw = io_apic_read(ioapic_idx, 0);
+ reg_01.raw = io_apic_read(ioapic_idx, 1);
+ if (reg_01.bits.version >= 0x10)
+ reg_02.raw = io_apic_read(ioapic_idx, 2);
+ if (reg_01.bits.version >= 0x20)
+ reg_03.raw = io_apic_read(ioapic_idx, 3);
+ }
+
+ apic_dbg("IO APIC #%d......\n", mpc_ioapic_id(ioapic_idx));
+ apic_dbg(".... register #00: %08X\n", reg_00.raw);
+ apic_dbg("....... : physical APIC id: %02X\n", reg_00.bits.ID);
+ apic_dbg("....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
+ apic_dbg("....... : LTS : %X\n", reg_00.bits.LTS);
+ apic_dbg(".... register #01: %08X\n", *(int *)&reg_01);
+ apic_dbg("....... : max redirection entries: %02X\n", reg_01.bits.entries);
+ apic_dbg("....... : PRQ implemented: %X\n", reg_01.bits.PRQ);
+ apic_dbg("....... : IO APIC version: %02X\n", reg_01.bits.version);
/*
* Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02,
@@ -1295,8 +1203,8 @@ static void __init print_IO_APIC(int ioapic_idx)
* value, so ignore it if reg_02 == reg_01.
*/
if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) {
- printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
- printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration);
+ apic_dbg(".... register #02: %08X\n", reg_02.raw);
+ apic_dbg("....... : arbitration: %02X\n", reg_02.bits.arbitration);
}
/*
@@ -1306,11 +1214,11 @@ static void __init print_IO_APIC(int ioapic_idx)
*/
if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw &&
reg_03.raw != reg_01.raw) {
- printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw);
- printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT);
+ apic_dbg(".... register #03: %08X\n", reg_03.raw);
+ apic_dbg("....... : Boot DT : %X\n", reg_03.bits.boot_DT);
}
- printk(KERN_DEBUG ".... IRQ redirection table:\n");
+ apic_dbg(".... IRQ redirection table:\n");
io_apic_print_entries(ioapic_idx, reg_01.bits.entries);
}
@@ -1319,11 +1227,11 @@ void __init print_IO_APICs(void)
int ioapic_idx;
unsigned int irq;
- printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
- for_each_ioapic(ioapic_idx)
- printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
- mpc_ioapic_id(ioapic_idx),
- ioapics[ioapic_idx].nr_registers);
+ apic_dbg("number of MP IRQ sources: %d.\n", mp_irq_entries);
+ for_each_ioapic(ioapic_idx) {
+ apic_dbg("number of IO-APIC #%d registers: %d.\n",
+ mpc_ioapic_id(ioapic_idx), ioapics[ioapic_idx].nr_registers);
+ }
/*
* We are a bit conservative about what we expect. We have to
@@ -1334,7 +1242,7 @@ void __init print_IO_APICs(void)
for_each_ioapic(ioapic_idx)
print_IO_APIC(ioapic_idx);
- printk(KERN_DEBUG "IRQ to pin mappings:\n");
+ apic_dbg("IRQ to pin mappings:\n");
for_each_active_irq(irq) {
struct irq_pin_list *entry;
struct irq_chip *chip;
@@ -1349,7 +1257,7 @@ void __init print_IO_APICs(void)
if (list_empty(&data->irq_2_pin))
continue;
- printk(KERN_DEBUG "IRQ%d ", irq);
+ apic_dbg("IRQ%d ", irq);
for_each_irq_pin(entry, data->irq_2_pin)
pr_cont("-> %d:%d", entry->apic, entry->pin);
pr_cont("\n");
@@ -1363,8 +1271,7 @@ static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
void __init enable_IO_APIC(void)
{
- int i8259_apic, i8259_pin;
- int apic, pin;
+ int i8259_apic, i8259_pin, apic, pin;
if (ioapic_is_disabled)
nr_ioapics = 0;
@@ -1376,19 +1283,21 @@ void __init enable_IO_APIC(void)
/* See if any of the pins is in ExtINT mode */
struct IO_APIC_route_entry entry = ioapic_read_entry(apic, pin);
- /* If the interrupt line is enabled and in ExtInt mode
- * I have found the pin where the i8259 is connected.
+ /*
+ * If the interrupt line is enabled and in ExtInt mode I
+ * have found the pin where the i8259 is connected.
*/
- if (!entry.masked &&
- entry.delivery_mode == APIC_DELIVERY_MODE_EXTINT) {
+ if (!entry.masked && entry.delivery_mode == APIC_DELIVERY_MODE_EXTINT) {
ioapic_i8259.apic = apic;
ioapic_i8259.pin = pin;
- goto found_i8259;
+ break;
}
}
- found_i8259:
- /* Look to see what if the MP table has reported the ExtINT */
- /* If we could not find the appropriate pin by looking at the ioapic
+
+ /*
+ * Look to see what if the MP table has reported the ExtINT
+ *
+ * If we could not find the appropriate pin by looking at the ioapic
* the i8259 probably is not connected the ioapic but give the
* mptable a chance anyway.
*/
@@ -1396,29 +1305,24 @@ void __init enable_IO_APIC(void)
i8259_apic = find_isa_irq_apic(0, mp_ExtINT);
/* Trust the MP table if nothing is setup in the hardware */
if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) {
- printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n");
+ pr_warn("ExtINT not setup in hardware but reported by MP table\n");
ioapic_i8259.pin = i8259_pin;
ioapic_i8259.apic = i8259_apic;
}
/* Complain if the MP table and the hardware disagree */
if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) &&
- (i8259_pin >= 0) && (ioapic_i8259.pin >= 0))
- {
- printk(KERN_WARNING "ExtINT in hardware and MP table differ\n");
- }
+ (i8259_pin >= 0) && (ioapic_i8259.pin >= 0))
+ pr_warn("ExtINT in hardware and MP table differ\n");
- /*
- * Do not trust the IO-APIC being empty at bootup
- */
+ /* Do not trust the IO-APIC being empty at bootup */
clear_IO_APIC();
}
void native_restore_boot_irq_mode(void)
{
/*
- * If the i8259 is routed through an IOAPIC
- * Put that IOAPIC in virtual wire mode
- * so legacy interrupts can be delivered.
+ * If the i8259 is routed through an IOAPIC Put that IOAPIC in
+ * virtual wire mode so legacy interrupts can be delivered.
*/
if (ioapic_i8259.pin != -1) {
struct IO_APIC_route_entry entry;
@@ -1433,9 +1337,7 @@ void native_restore_boot_irq_mode(void)
entry.destid_0_7 = apic_id & 0xFF;
entry.virt_destid_8_14 = apic_id >> 8;
- /*
- * Add it to the IO-APIC irq-routing table:
- */
+ /* Add it to the IO-APIC irq-routing table */
ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry);
}
@@ -1464,7 +1366,6 @@ static void __init setup_ioapic_ids_from_mpc_nocheck(void)
const u32 broadcast_id = 0xF;
union IO_APIC_reg_00 reg_00;
unsigned char old_id;
- unsigned long flags;
int ioapic_idx, i;
/*
@@ -1478,9 +1379,8 @@ static void __init setup_ioapic_ids_from_mpc_nocheck(void)
*/
for_each_ioapic(ioapic_idx) {
/* Read the register 0 value */
- raw_spin_lock_irqsave(&ioapic_lock, flags);
- reg_00.raw = io_apic_read(ioapic_idx, 0);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+ scoped_guard (raw_spinlock_irqsave, &ioapic_lock)
+ reg_00.raw = io_apic_read(ioapic_idx, 0);
old_id = mpc_ioapic_id(ioapic_idx);
@@ -1508,47 +1408,42 @@ static void __init setup_ioapic_ids_from_mpc_nocheck(void)
set_bit(i, phys_id_present_map);
ioapics[ioapic_idx].mp_config.apicid = i;
} else {
- apic_printk(APIC_VERBOSE, "Setting %d in the phys_id_present_map\n",
- mpc_ioapic_id(ioapic_idx));
+ apic_pr_verbose("Setting %d in the phys_id_present_map\n",
+ mpc_ioapic_id(ioapic_idx));
set_bit(mpc_ioapic_id(ioapic_idx), phys_id_present_map);
}
/*
- * We need to adjust the IRQ routing table
- * if the ID changed.
+ * We need to adjust the IRQ routing table if the ID
+ * changed.
*/
- if (old_id != mpc_ioapic_id(ioapic_idx))
- for (i = 0; i < mp_irq_entries; i++)
+ if (old_id != mpc_ioapic_id(ioapic_idx)) {
+ for (i = 0; i < mp_irq_entries; i++) {
if (mp_irqs[i].dstapic == old_id)
- mp_irqs[i].dstapic
- = mpc_ioapic_id(ioapic_idx);
+ mp_irqs[i].dstapic = mpc_ioapic_id(ioapic_idx);
+ }
+ }
/*
- * Update the ID register according to the right value
- * from the MPC table if they are different.
+ * Update the ID register according to the right value from
+ * the MPC table if they are different.
*/
if (mpc_ioapic_id(ioapic_idx) == reg_00.bits.ID)
continue;
- apic_printk(APIC_VERBOSE, KERN_INFO
- "...changing IO-APIC physical APIC ID to %d ...",
- mpc_ioapic_id(ioapic_idx));
+ apic_pr_verbose("...changing IO-APIC physical APIC ID to %d ...",
+ mpc_ioapic_id(ioapic_idx));
reg_00.bits.ID = mpc_ioapic_id(ioapic_idx);
- raw_spin_lock_irqsave(&ioapic_lock, flags);
- io_apic_write(ioapic_idx, 0, reg_00.raw);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
-
- /*
- * Sanity check
- */
- raw_spin_lock_irqsave(&ioapic_lock, flags);
- reg_00.raw = io_apic_read(ioapic_idx, 0);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+ scoped_guard (raw_spinlock_irqsave, &ioapic_lock) {
+ io_apic_write(ioapic_idx, 0, reg_00.raw);
+ reg_00.raw = io_apic_read(ioapic_idx, 0);
+ }
+ /* Sanity check */
if (reg_00.bits.ID != mpc_ioapic_id(ioapic_idx))
pr_cont("could not set ID!\n");
else
- apic_printk(APIC_VERBOSE, " ok.\n");
+ apic_pr_verbose(" ok.\n");
}
}
@@ -1593,8 +1488,7 @@ static void __init delay_with_tsc(void)
do {
rep_nop();
now = rdtsc();
- } while ((now - start) < 40000000000ULL / HZ &&
- time_before_eq(jiffies, end));
+ } while ((now - start) < 40000000000ULL / HZ && time_before_eq(jiffies, end));
}
static void __init delay_without_tsc(void)
@@ -1655,36 +1549,29 @@ static int __init timer_irq_works(void)
* so we 'resend' these IRQs via IPIs, to the same CPU. It's much
* better to do it this way as thus we do not have to be aware of
* 'pending' interrupts in the IRQ path, except at this point.
- */
-/*
- * Edge triggered needs to resend any interrupt
- * that was delayed but this is now handled in the device
- * independent code.
- */
-
-/*
- * Starting up a edge-triggered IO-APIC interrupt is
- * nasty - we need to make sure that we get the edge.
- * If it is already asserted for some reason, we need
- * return 1 to indicate that is was pending.
*
- * This is not complete - we should be able to fake
- * an edge even if it isn't on the 8259A...
+ *
+ * Edge triggered needs to resend any interrupt that was delayed but this
+ * is now handled in the device independent code.
+ *
+ * Starting up a edge-triggered IO-APIC interrupt is nasty - we need to
+ * make sure that we get the edge. If it is already asserted for some
+ * reason, we need return 1 to indicate that is was pending.
+ *
+ * This is not complete - we should be able to fake an edge even if it
+ * isn't on the 8259A...
*/
static unsigned int startup_ioapic_irq(struct irq_data *data)
{
int was_pending = 0, irq = data->irq;
- unsigned long flags;
- raw_spin_lock_irqsave(&ioapic_lock, flags);
+ guard(raw_spinlock_irqsave)(&ioapic_lock);
if (irq < nr_legacy_irqs()) {
legacy_pic->mask(irq);
if (legacy_pic->irq_pending(irq))
was_pending = 1;
}
__unmask_ioapic(data->chip_data);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
-
return was_pending;
}
@@ -1694,9 +1581,8 @@ atomic_t irq_mis_count;
static bool io_apic_level_ack_pending(struct mp_chip_data *data)
{
struct irq_pin_list *entry;
- unsigned long flags;
- raw_spin_lock_irqsave(&ioapic_lock, flags);
+ guard(raw_spinlock_irqsave)(&ioapic_lock);
for_each_irq_pin(entry, data->irq_2_pin) {
struct IO_APIC_route_entry e;
int pin;
@@ -1704,13 +1590,9 @@ static bool io_apic_level_ack_pending(struct mp_chip_data *data)
pin = entry->pin;
e.w1 = io_apic_read(entry->apic, 0x10 + pin*2);
/* Is the remote IRR bit set? */
- if (e.irr) {
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+ if (e.irr)
return true;
- }
}
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
-
return false;
}
@@ -1728,7 +1610,8 @@ static inline bool ioapic_prepare_move(struct irq_data *data)
static inline void ioapic_finish_move(struct irq_data *data, bool moveit)
{
if (unlikely(moveit)) {
- /* Only migrate the irq if the ack has been received.
+ /*
+ * Only migrate the irq if the ack has been received.
*
* On rare occasions the broadcast level triggered ack gets
* delayed going to ioapics, and if we reprogram the
@@ -1911,18 +1794,16 @@ static void ioapic_configure_entry(struct irq_data *irqd)
__ioapic_write_entry(entry->apic, entry->pin, mpd->entry);
}
-static int ioapic_set_affinity(struct irq_data *irq_data,
- const struct cpumask *mask, bool force)
+static int ioapic_set_affinity(struct irq_data *irq_data, const struct cpumask *mask, bool force)
{
struct irq_data *parent = irq_data->parent_data;
- unsigned long flags;
int ret;
ret = parent->chip->irq_set_affinity(parent, mask, force);
- raw_spin_lock_irqsave(&ioapic_lock, flags);
+
+ guard(raw_spinlock_irqsave)(&ioapic_lock);
if (ret >= 0 && ret != IRQ_SET_MASK_OK_DONE)
ioapic_configure_entry(irq_data);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
return ret;
}
@@ -1941,9 +1822,8 @@ static int ioapic_set_affinity(struct irq_data *irq_data,
*
* Verify that the corresponding Remote-IRR bits are clear.
*/
-static int ioapic_irq_get_chip_state(struct irq_data *irqd,
- enum irqchip_irq_state which,
- bool *state)
+static int ioapic_irq_get_chip_state(struct irq_data *irqd, enum irqchip_irq_state which,
+ bool *state)
{
struct mp_chip_data *mcd = irqd->chip_data;
struct IO_APIC_route_entry rentry;
@@ -1953,7 +1833,8 @@ static int ioapic_irq_get_chip_state(struct irq_data *irqd,
return -EINVAL;
*state = false;
- raw_spin_lock(&ioapic_lock);
+
+ guard(raw_spinlock)(&ioapic_lock);
for_each_irq_pin(p, mcd->irq_2_pin) {
rentry = __ioapic_read_entry(p->apic, p->pin);
/*
@@ -1967,7 +1848,6 @@ static int ioapic_irq_get_chip_state(struct irq_data *irqd,
break;
}
}
- raw_spin_unlock(&ioapic_lock);
return 0;
}
@@ -2008,14 +1888,13 @@ static inline void init_IO_APIC_traps(void)
cfg = irq_cfg(irq);
if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) {
/*
- * Hmm.. We don't have an entry for this,
- * so default to an old-fashioned 8259
- * interrupt if we can..
+ * Hmm.. We don't have an entry for this, so
+ * default to an old-fashioned 8259 interrupt if we
+ * can. Otherwise set the dummy interrupt chip.
*/
if (irq < nr_legacy_irqs())
legacy_pic->make_irq(irq);
else
- /* Strange. Oh, well.. */
irq_set_chip(irq, &no_irq_chip);
}
}
@@ -2024,20 +1903,17 @@ static inline void init_IO_APIC_traps(void)
/*
* The local APIC irq-chip implementation:
*/
-
static void mask_lapic_irq(struct irq_data *data)
{
- unsigned long v;
+ unsigned long v = apic_read(APIC_LVT0);
- v = apic_read(APIC_LVT0);
apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
}
static void unmask_lapic_irq(struct irq_data *data)
{
- unsigned long v;
+ unsigned long v = apic_read(APIC_LVT0);
- v = apic_read(APIC_LVT0);
apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
}
@@ -2056,8 +1932,7 @@ static struct irq_chip lapic_chip __read_mostly = {
static void lapic_register_intr(int irq)
{
irq_clear_status_flags(irq, IRQ_LEVEL);
- irq_set_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
- "edge");
+ irq_set_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, "edge");
}
/*
@@ -2069,9 +1944,9 @@ static void lapic_register_intr(int irq)
*/
static inline void __init unlock_ExtINT_logic(void)
{
- int apic, pin, i;
- struct IO_APIC_route_entry entry0, entry1;
unsigned char save_control, save_freq_select;
+ struct IO_APIC_route_entry entry0, entry1;
+ int apic, pin, i;
u32 apic_id;
pin = find_isa_irq_pin(8, mp_INT);
@@ -2131,10 +2006,10 @@ static int __init disable_timer_pin_setup(char *arg)
}
early_param("disable_timer_pin_1", disable_timer_pin_setup);
-static int mp_alloc_timer_irq(int ioapic, int pin)
+static int __init mp_alloc_timer_irq(int ioapic, int pin)
{
- int irq = -1;
struct irq_domain *domain = mp_ioapic_irqdomain(ioapic);
+ int irq = -1;
if (domain) {
struct irq_alloc_info info;
@@ -2142,21 +2017,36 @@ static int mp_alloc_timer_irq(int ioapic, int pin)
ioapic_set_alloc_attr(&info, NUMA_NO_NODE, 0, 0);
info.devid = mpc_ioapic_id(ioapic);
info.ioapic.pin = pin;
- mutex_lock(&ioapic_mutex);
+ guard(mutex)(&ioapic_mutex);
irq = alloc_isa_irq_from_domain(domain, 0, ioapic, pin, &info);
- mutex_unlock(&ioapic_mutex);
}
return irq;
}
+static void __init replace_pin_at_irq_node(struct mp_chip_data *data, int node,
+ int oldapic, int oldpin,
+ int newapic, int newpin)
+{
+ struct irq_pin_list *entry;
+
+ for_each_irq_pin(entry, data->irq_2_pin) {
+ if (entry->apic == oldapic && entry->pin == oldpin) {
+ entry->apic = newapic;
+ entry->pin = newpin;
+ return;
+ }
+ }
+
+ /* Old apic/pin didn't exist, so just add a new one */
+ add_pin_to_irq_node(data, node, newapic, newpin);
+}
+
/*
* This code may look a bit paranoid, but it's supposed to cooperate with
* a wide range of boards and BIOS bugs. Fortunately only the timer IRQ
* is so screwy. Thanks to Brian Perkins for testing/hacking this beast
* fanatically on his truly buggy board.
- *
- * FIXME: really need to revamp this for all platforms.
*/
static inline void __init check_timer(void)
{
@@ -2194,9 +2084,8 @@ static inline void __init check_timer(void)
pin2 = ioapic_i8259.pin;
apic2 = ioapic_i8259.apic;
- apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X "
- "apic1=%d pin1=%d apic2=%d pin2=%d\n",
- cfg->vector, apic1, pin1, apic2, pin2);
+ pr_info("..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
+ cfg->vector, apic1, pin1, apic2, pin2);
/*
* Some BIOS writers are clueless and report the ExtINTA
@@ -2240,13 +2129,10 @@ static inline void __init check_timer(void)
panic_if_irq_remap("timer doesn't work through Interrupt-remapped IO-APIC");
clear_IO_APIC_pin(apic1, pin1);
if (!no_pin1)
- apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
- "8254 timer not connected to IO-APIC\n");
+ pr_err("..MP-BIOS bug: 8254 timer not connected to IO-APIC\n");
- apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer "
- "(IRQ0) through the 8259A ...\n");
- apic_printk(APIC_QUIET, KERN_INFO
- "..... (found apic %d pin %d) ...\n", apic2, pin2);
+ pr_info("...trying to set up timer (IRQ0) through the 8259A ...\n");
+ pr_info("..... (found apic %d pin %d) ...\n", apic2, pin2);
/*
* legacy devices should be connected to IO APIC #0
*/
@@ -2255,7 +2141,7 @@ static inline void __init check_timer(void)
irq_domain_activate_irq(irq_data, false);
legacy_pic->unmask(0);
if (timer_irq_works()) {
- apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
+ pr_info("....... works.\n");
goto out;
}
/*
@@ -2263,26 +2149,24 @@ static inline void __init check_timer(void)
*/
legacy_pic->mask(0);
clear_IO_APIC_pin(apic2, pin2);
- apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
+ pr_info("....... failed.\n");
}
- apic_printk(APIC_QUIET, KERN_INFO
- "...trying to set up timer as Virtual Wire IRQ...\n");
+ pr_info("...trying to set up timer as Virtual Wire IRQ...\n");
lapic_register_intr(0);
apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */
legacy_pic->unmask(0);
if (timer_irq_works()) {
- apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
+ pr_info("..... works.\n");
goto out;
}
legacy_pic->mask(0);
apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);
- apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
+ pr_info("..... failed.\n");
- apic_printk(APIC_QUIET, KERN_INFO
- "...trying to set up timer as ExtINT IRQ...\n");
+ pr_info("...trying to set up timer as ExtINT IRQ...\n");
legacy_pic->init(0);
legacy_pic->make_irq(0);
@@ -2292,14 +2176,15 @@ static inline void __init check_timer(void)
unlock_ExtINT_logic();
if (timer_irq_works()) {
- apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
+ pr_info("..... works.\n");
goto out;
}
- apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
- if (apic_is_x2apic_enabled())
- apic_printk(APIC_QUIET, KERN_INFO
- "Perhaps problem with the pre-enabled x2apic mode\n"
- "Try booting with x2apic and interrupt-remapping disabled in the bios.\n");
+
+ pr_info("..... failed :\n");
+ if (apic_is_x2apic_enabled()) {
+ pr_info("Perhaps problem with the pre-enabled x2apic mode\n"
+ "Try booting with x2apic and interrupt-remapping disabled in the bios.\n");
+ }
panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
"report. Then try booting with the 'noapic' option.\n");
out:
@@ -2327,11 +2212,11 @@ out:
static int mp_irqdomain_create(int ioapic)
{
- struct irq_domain *parent;
+ struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(ioapic);
int hwirqs = mp_ioapic_pin_count(ioapic);
struct ioapic *ip = &ioapics[ioapic];
struct ioapic_domain_cfg *cfg = &ip->irqdomain_cfg;
- struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(ioapic);
+ struct irq_domain *parent;
struct fwnode_handle *fn;
struct irq_fwspec fwspec;
@@ -2367,10 +2252,8 @@ static int mp_irqdomain_create(int ioapic)
return -ENOMEM;
}
- if (cfg->type == IOAPIC_DOMAIN_LEGACY ||
- cfg->type == IOAPIC_DOMAIN_STRICT)
- ioapic_dynirq_base = max(ioapic_dynirq_base,
- gsi_cfg->gsi_end + 1);
+ if (cfg->type == IOAPIC_DOMAIN_LEGACY || cfg->type == IOAPIC_DOMAIN_STRICT)
+ ioapic_dynirq_base = max(ioapic_dynirq_base, gsi_cfg->gsi_end + 1);
return 0;
}
@@ -2397,13 +2280,11 @@ void __init setup_IO_APIC(void)
io_apic_irqs = nr_legacy_irqs() ? ~PIC_IRQS : ~0UL;
- apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
+ apic_pr_verbose("ENABLING IO-APIC IRQs\n");
for_each_ioapic(ioapic)
BUG_ON(mp_irqdomain_create(ioapic));
- /*
- * Set up IO-APIC IRQ routing.
- */
+ /* Set up IO-APIC IRQ routing. */
x86_init.mpparse.setup_ioapic_ids();
sync_Arb_IDs();
@@ -2417,16 +2298,14 @@ void __init setup_IO_APIC(void)
static void resume_ioapic_id(int ioapic_idx)
{
- unsigned long flags;
union IO_APIC_reg_00 reg_00;
- raw_spin_lock_irqsave(&ioapic_lock, flags);
+ guard(raw_spinlock_irqsave)(&ioapic_lock);
reg_00.raw = io_apic_read(ioapic_idx, 0);
if (reg_00.bits.ID != mpc_ioapic_id(ioapic_idx)) {
reg_00.bits.ID = mpc_ioapic_id(ioapic_idx);
io_apic_write(ioapic_idx, 0, reg_00.raw);
}
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
}
static void ioapic_resume(void)
@@ -2440,8 +2319,8 @@ static void ioapic_resume(void)
}
static struct syscore_ops ioapic_syscore_ops = {
- .suspend = save_ioapic_entries,
- .resume = ioapic_resume,
+ .suspend = save_ioapic_entries,
+ .resume = ioapic_resume,
};
static int __init ioapic_init_ops(void)
@@ -2456,15 +2335,13 @@ device_initcall(ioapic_init_ops);
static int io_apic_get_redir_entries(int ioapic)
{
union IO_APIC_reg_01 reg_01;
- unsigned long flags;
- raw_spin_lock_irqsave(&ioapic_lock, flags);
+ guard(raw_spinlock_irqsave)(&ioapic_lock);
reg_01.raw = io_apic_read(ioapic, 1);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
- /* The register returns the maximum index redir index
- * supported, which is one less than the total number of redir
- * entries.
+ /*
+ * The register returns the maximum index redir index supported,
+ * which is one less than the total number of redir entries.
*/
return reg_01.bits.entries + 1;
}
@@ -2494,16 +2371,14 @@ static int io_apic_get_unique_id(int ioapic, int apic_id)
static DECLARE_BITMAP(apic_id_map, MAX_LOCAL_APIC);
const u32 broadcast_id = 0xF;
union IO_APIC_reg_00 reg_00;
- unsigned long flags;
int i = 0;
/* Initialize the ID map */
if (bitmap_empty(apic_id_map, MAX_LOCAL_APIC))
copy_phys_cpu_present_map(apic_id_map);
- raw_spin_lock_irqsave(&ioapic_lock, flags);
- reg_00.raw = io_apic_read(ioapic, 0);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+ scoped_guard (raw_spinlock_irqsave, &ioapic_lock)
+ reg_00.raw = io_apic_read(ioapic, 0);
if (apic_id >= broadcast_id) {
pr_warn("IOAPIC[%d]: Invalid apic_id %d, trying %d\n",
@@ -2530,21 +2405,19 @@ static int io_apic_get_unique_id(int ioapic, int apic_id)
if (reg_00.bits.ID != apic_id) {
reg_00.bits.ID = apic_id;
- raw_spin_lock_irqsave(&ioapic_lock, flags);
- io_apic_write(ioapic, 0, reg_00.raw);
- reg_00.raw = io_apic_read(ioapic, 0);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+ scoped_guard (raw_spinlock_irqsave, &ioapic_lock) {
+ io_apic_write(ioapic, 0, reg_00.raw);
+ reg_00.raw = io_apic_read(ioapic, 0);
+ }
/* Sanity check */
if (reg_00.bits.ID != apic_id) {
- pr_err("IOAPIC[%d]: Unable to change apic_id!\n",
- ioapic);
+ pr_err("IOAPIC[%d]: Unable to change apic_id!\n", ioapic);
return -1;
}
}
- apic_printk(APIC_VERBOSE, KERN_INFO
- "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id);
+ apic_pr_verbose("IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id);
return apic_id;
}
@@ -2560,7 +2433,6 @@ static u8 io_apic_unique_id(int idx, u8 id)
{
union IO_APIC_reg_00 reg_00;
DECLARE_BITMAP(used, 256);
- unsigned long flags;
u8 new_id;
int i;
@@ -2576,26 +2448,23 @@ static u8 io_apic_unique_id(int idx, u8 id)
* Read the current id from the ioapic and keep it if
* available.
*/
- raw_spin_lock_irqsave(&ioapic_lock, flags);
- reg_00.raw = io_apic_read(idx, 0);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+ scoped_guard (raw_spinlock_irqsave, &ioapic_lock)
+ reg_00.raw = io_apic_read(idx, 0);
+
new_id = reg_00.bits.ID;
if (!test_bit(new_id, used)) {
- apic_printk(APIC_VERBOSE, KERN_INFO
- "IOAPIC[%d]: Using reg apic_id %d instead of %d\n",
- idx, new_id, id);
+ apic_pr_verbose("IOAPIC[%d]: Using reg apic_id %d instead of %d\n",
+ idx, new_id, id);
return new_id;
}
- /*
- * Get the next free id and write it to the ioapic.
- */
+ /* Get the next free id and write it to the ioapic. */
new_id = find_first_zero_bit(used, 256);
reg_00.bits.ID = new_id;
- raw_spin_lock_irqsave(&ioapic_lock, flags);
- io_apic_write(idx, 0, reg_00.raw);
- reg_00.raw = io_apic_read(idx, 0);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+ scoped_guard (raw_spinlock_irqsave, &ioapic_lock) {
+ io_apic_write(idx, 0, reg_00.raw);
+ reg_00.raw = io_apic_read(idx, 0);
+ }
/* Sanity check */
BUG_ON(reg_00.bits.ID != new_id);
@@ -2605,12 +2474,10 @@ static u8 io_apic_unique_id(int idx, u8 id)
static int io_apic_get_version(int ioapic)
{
- union IO_APIC_reg_01 reg_01;
- unsigned long flags;
+ union IO_APIC_reg_01 reg_01;
- raw_spin_lock_irqsave(&ioapic_lock, flags);
+ guard(raw_spinlock_irqsave)(&ioapic_lock);
reg_01.raw = io_apic_read(ioapic, 1);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
return reg_01.bits.version;
}
@@ -2625,8 +2492,8 @@ static struct resource *ioapic_resources;
static struct resource * __init ioapic_setup_resources(void)
{
- unsigned long n;
struct resource *res;
+ unsigned long n;
char *mem;
int i;
@@ -2686,9 +2553,7 @@ void __init io_apic_init_mappings(void)
ioapic_phys = mpc_ioapic_addr(i);
#ifdef CONFIG_X86_32
if (!ioapic_phys) {
- printk(KERN_ERR
- "WARNING: bogus zero IO-APIC "
- "address found in MPTABLE, "
+ pr_err("WARNING: bogus zero IO-APIC address found in MPTABLE, "
"disabling IO/APIC support!\n");
smp_found_config = 0;
ioapic_is_disabled = true;
@@ -2707,9 +2572,8 @@ fake_ioapic_page:
ioapic_phys = __pa(ioapic_phys);
}
io_apic_set_fixmap(idx, ioapic_phys);
- apic_printk(APIC_VERBOSE, "mapped IOAPIC to %08lx (%08lx)\n",
- __fix_to_virt(idx) + (ioapic_phys & ~PAGE_MASK),
- ioapic_phys);
+ apic_pr_verbose("mapped IOAPIC to %08lx (%08lx)\n",
+ __fix_to_virt(idx) + (ioapic_phys & ~PAGE_MASK), ioapic_phys);
idx++;
ioapic_res->start = ioapic_phys;
@@ -2720,13 +2584,12 @@ fake_ioapic_page:
void __init ioapic_insert_resources(void)
{
- int i;
struct resource *r = ioapic_resources;
+ int i;
if (!r) {
if (nr_ioapics > 0)
- printk(KERN_ERR
- "IO APIC resources couldn't be allocated.\n");
+ pr_err("IO APIC resources couldn't be allocated.\n");
return;
}
@@ -2746,11 +2609,12 @@ int mp_find_ioapic(u32 gsi)
/* Find the IOAPIC that manages this GSI. */
for_each_ioapic(i) {
struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(i);
+
if (gsi >= gsi_cfg->gsi_base && gsi <= gsi_cfg->gsi_end)
return i;
}
- printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
+ pr_err("ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
return -1;
}
@@ -2789,12 +2653,10 @@ static int bad_ioapic_register(int idx)
static int find_free_ioapic_entry(void)
{
- int idx;
-
- for (idx = 0; idx < MAX_IO_APICS; idx++)
+ for (int idx = 0; idx < MAX_IO_APICS; idx++) {
if (ioapics[idx].nr_registers == 0)
return idx;
-
+ }
return MAX_IO_APICS;
}
@@ -2805,8 +2667,7 @@ static int find_free_ioapic_entry(void)
* @gsi_base: base of GSI associated with the IOAPIC
* @cfg: configuration information for the IOAPIC
*/
-int mp_register_ioapic(int id, u32 address, u32 gsi_base,
- struct ioapic_domain_cfg *cfg)
+int mp_register_ioapic(int id, u32 address, u32 gsi_base, struct ioapic_domain_cfg *cfg)
{
bool hotplug = !!ioapic_initialized;
struct mp_ioapic_gsi *gsi_cfg;
@@ -2817,12 +2678,13 @@ int mp_register_ioapic(int id, u32 address, u32 gsi_base,
pr_warn("Bogus (zero) I/O APIC address found, skipping!\n");
return -EINVAL;
}
- for_each_ioapic(ioapic)
+
+ for_each_ioapic(ioapic) {
if (ioapics[ioapic].mp_config.apicaddr == address) {
- pr_warn("address 0x%x conflicts with IOAPIC%d\n",
- address, ioapic);
+ pr_warn("address 0x%x conflicts with IOAPIC%d\n", address, ioapic);
return -EEXIST;
}
+ }
idx = find_free_ioapic_entry();
if (idx >= MAX_IO_APICS) {
@@ -2857,8 +2719,7 @@ int mp_register_ioapic(int id, u32 address, u32 gsi_base,
(gsi_end >= gsi_cfg->gsi_base &&
gsi_end <= gsi_cfg->gsi_end)) {
pr_warn("GSI range [%u-%u] for new IOAPIC conflicts with GSI[%u-%u]\n",
- gsi_base, gsi_end,
- gsi_cfg->gsi_base, gsi_cfg->gsi_end);
+ gsi_base, gsi_end, gsi_cfg->gsi_base, gsi_cfg->gsi_end);
clear_fixmap(FIX_IO_APIC_BASE_0 + idx);
return -ENOSPC;
}
@@ -2892,8 +2753,7 @@ int mp_register_ioapic(int id, u32 address, u32 gsi_base,
ioapics[idx].nr_registers = entries;
pr_info("IOAPIC[%d]: apic_id %d, version %d, address 0x%x, GSI %d-%d\n",
- idx, mpc_ioapic_id(idx),
- mpc_ioapic_ver(idx), mpc_ioapic_addr(idx),
+ idx, mpc_ioapic_id(idx), mpc_ioapic_ver(idx), mpc_ioapic_addr(idx),
gsi_cfg->gsi_base, gsi_cfg->gsi_end);
return 0;
@@ -2904,11 +2764,13 @@ int mp_unregister_ioapic(u32 gsi_base)
int ioapic, pin;
int found = 0;
- for_each_ioapic(ioapic)
+ for_each_ioapic(ioapic) {
if (ioapics[ioapic].gsi_config.gsi_base == gsi_base) {
found = 1;
break;
}
+ }
+
if (!found) {
pr_warn("can't find IOAPIC for GSI %d\n", gsi_base);
return -ENODEV;
@@ -2922,8 +2784,7 @@ int mp_unregister_ioapic(u32 gsi_base)
if (irq >= 0) {
data = irq_get_chip_data(irq);
if (data && data->count) {
- pr_warn("pin%d on IOAPIC%d is still in use.\n",
- pin, ioapic);
+ pr_warn("pin%d on IOAPIC%d is still in use.\n", pin, ioapic);
return -EBUSY;
}
}
@@ -2958,8 +2819,7 @@ static void mp_irqdomain_get_attr(u32 gsi, struct mp_chip_data *data,
if (info && info->ioapic.valid) {
data->is_level = info->ioapic.is_level;
data->active_low = info->ioapic.active_low;
- } else if (__acpi_get_override_irq(gsi, &data->is_level,
- &data->active_low) < 0) {
+ } else if (__acpi_get_override_irq(gsi, &data->is_level, &data->active_low) < 0) {
/* PCI interrupts are always active low level triggered. */
data->is_level = true;
data->active_low = true;
@@ -3017,10 +2877,8 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq,
return -ENOMEM;
ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, info);
- if (ret < 0) {
- kfree(data);
- return ret;
- }
+ if (ret < 0)
+ goto free_data;
INIT_LIST_HEAD(&data->irq_2_pin);
irq_data->hwirq = info->ioapic.pin;
@@ -3029,7 +2887,10 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq,
irq_data->chip_data = data;
mp_irqdomain_get_attr(mp_pin_to_gsi(ioapic, pin), data, info);
- add_pin_to_irq_node(data, ioapic_alloc_attr_node(info), ioapic, pin);
+ if (!add_pin_to_irq_node(data, ioapic_alloc_attr_node(info), ioapic, pin)) {
+ ret = -ENOMEM;
+ goto free_irqs;
+ }
mp_preconfigure_entry(data);
mp_register_handler(virq, data->is_level);
@@ -3039,11 +2900,15 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq,
legacy_pic->mask(virq);
local_irq_restore(flags);
- apic_printk(APIC_VERBOSE, KERN_DEBUG
- "IOAPIC[%d]: Preconfigured routing entry (%d-%d -> IRQ %d Level:%i ActiveLow:%i)\n",
- ioapic, mpc_ioapic_id(ioapic), pin, virq,
- data->is_level, data->active_low);
+ apic_pr_verbose("IOAPIC[%d]: Preconfigured routing entry (%d-%d -> IRQ %d Level:%i ActiveLow:%i)\n",
+ ioapic, mpc_ioapic_id(ioapic), pin, virq, data->is_level, data->active_low);
return 0;
+
+free_irqs:
+ irq_domain_free_irqs_parent(domain, virq, nr_irqs);
+free_data:
+ kfree(data);
+ return ret;
}
void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq,
@@ -3056,22 +2921,17 @@ void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq,
irq_data = irq_domain_get_irq_data(domain, virq);
if (irq_data && irq_data->chip_data) {
data = irq_data->chip_data;
- __remove_pin_from_irq(data, mp_irqdomain_ioapic_idx(domain),
- (int)irq_data->hwirq);
+ __remove_pin_from_irq(data, mp_irqdomain_ioapic_idx(domain), (int)irq_data->hwirq);
WARN_ON(!list_empty(&data->irq_2_pin));
kfree(irq_data->chip_data);
}
irq_domain_free_irqs_top(domain, virq, nr_irqs);
}
-int mp_irqdomain_activate(struct irq_domain *domain,
- struct irq_data *irq_data, bool reserve)
+int mp_irqdomain_activate(struct irq_domain *domain, struct irq_data *irq_data, bool reserve)
{
- unsigned long flags;
-
- raw_spin_lock_irqsave(&ioapic_lock, flags);
+ guard(raw_spinlock_irqsave)(&ioapic_lock);
ioapic_configure_entry(irq_data);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
return 0;
}
@@ -3079,8 +2939,7 @@ void mp_irqdomain_deactivate(struct irq_domain *domain,
struct irq_data *irq_data)
{
/* It won't be called for IRQ with multiple IOAPIC pins associated */
- ioapic_mask_entry(mp_irqdomain_ioapic_idx(domain),
- (int)irq_data->hwirq);
+ ioapic_mask_entry(mp_irqdomain_ioapic_idx(domain), (int)irq_data->hwirq);
}
int mp_irqdomain_ioapic_idx(struct irq_domain *domain)
diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c
index d9651f15ae4f..340769242dea 100644
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -184,7 +184,6 @@ static int x86_msi_prepare(struct irq_domain *domain, struct device *dev,
alloc->type = X86_IRQ_ALLOC_TYPE_PCI_MSI;
return 0;
case DOMAIN_BUS_PCI_DEVICE_MSIX:
- case DOMAIN_BUS_PCI_DEVICE_IMS:
alloc->type = X86_IRQ_ALLOC_TYPE_PCI_MSIX;
return 0;
default:
@@ -229,10 +228,6 @@ static bool x86_init_dev_msi_info(struct device *dev, struct irq_domain *domain,
case DOMAIN_BUS_PCI_DEVICE_MSI:
case DOMAIN_BUS_PCI_DEVICE_MSIX:
break;
- case DOMAIN_BUS_PCI_DEVICE_IMS:
- if (!(pops->supported_flags & MSI_FLAG_PCI_IMS))
- return false;
- break;
default:
WARN_ON_ONCE(1);
return false;
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 185738c72766..557318145038 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -965,7 +965,7 @@ static void __vector_cleanup(struct vector_cleanup *cl, bool check_irr)
lockdep_assert_held(&vector_lock);
hlist_for_each_entry_safe(apicd, tmp, &cl->head, clist) {
- unsigned int irr, vector = apicd->prev_vector;
+ unsigned int vector = apicd->prev_vector;
/*
* Paranoia: Check if the vector that needs to be cleaned
@@ -979,8 +979,7 @@ static void __vector_cleanup(struct vector_cleanup *cl, bool check_irr)
* fixup_irqs() was just called to scan IRR for set bits and
* forward them to new destination CPUs via IPIs.
*/
- irr = check_irr ? apic_read(APIC_IRR + (vector / 32 * 0x10)) : 0;
- if (irr & (1U << (vector % 32))) {
+ if (check_irr && is_vector_pending(vector)) {
pr_warn_once("Moved interrupt pending in old target APIC %u\n", apicd->irq);
rearm = true;
continue;
@@ -1036,7 +1035,8 @@ static void __vector_schedule_cleanup(struct apic_chip_data *apicd)
add_timer_on(&cl->timer, cpu);
}
} else {
- apicd->prev_vector = 0;
+ pr_warn("IRQ %u schedule cleanup for offline CPU %u\n", apicd->irq, cpu);
+ free_moved_vector(apicd);
}
raw_spin_unlock(&vector_lock);
}
@@ -1073,6 +1073,7 @@ void irq_complete_move(struct irq_cfg *cfg)
*/
void irq_force_complete_move(struct irq_desc *desc)
{
+ unsigned int cpu = smp_processor_id();
struct apic_chip_data *apicd;
struct irq_data *irqd;
unsigned int vector;
@@ -1097,10 +1098,11 @@ void irq_force_complete_move(struct irq_desc *desc)
goto unlock;
/*
- * If prev_vector is empty, no action required.
+ * If prev_vector is empty or the descriptor is neither currently
+ * nor previously on the outgoing CPU no action required.
*/
vector = apicd->prev_vector;
- if (!vector)
+ if (!vector || (apicd->cpu != cpu && apicd->prev_cpu != cpu))
goto unlock;
/*
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 567dbd2fe4b6..7db83212effb 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -178,13 +178,16 @@ static int x2apic_prepare_cpu(unsigned int cpu)
u32 phys_apicid = apic->cpu_present_to_apicid(cpu);
u32 cluster = apic_cluster(phys_apicid);
u32 logical_apicid = (cluster << 16) | (1 << (phys_apicid & 0xf));
+ int node = cpu_to_node(cpu);
x86_cpu_to_logical_apicid[cpu] = logical_apicid;
- if (alloc_clustermask(cpu, cluster, cpu_to_node(cpu)) < 0)
+ if (alloc_clustermask(cpu, cluster, node) < 0)
return -ENOMEM;
- if (!zalloc_cpumask_var(&per_cpu(ipi_mask, cpu), GFP_KERNEL))
+
+ if (!zalloc_cpumask_var_node(&per_cpu(ipi_mask, cpu), GFP_KERNEL, node))
return -ENOMEM;
+
return 0;
}
diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c
index e92ff0c11db8..465647456753 100644
--- a/arch/x86/kernel/callthunks.c
+++ b/arch/x86/kernel/callthunks.c
@@ -185,8 +185,7 @@ static void *patch_dest(void *dest, bool direct)
u8 *pad = dest - tsize;
memcpy(insn_buff, skl_call_thunk_template, tsize);
- apply_relocation(insn_buff, tsize, pad,
- skl_call_thunk_template, tsize);
+ apply_relocation(insn_buff, pad, tsize, skl_call_thunk_template, tsize);
/* Already patched? */
if (!bcmp(pad, insn_buff, tsize))
@@ -308,8 +307,7 @@ static bool is_callthunk(void *addr)
pad = (void *)(dest - tmpl_size);
memcpy(insn_buff, skl_call_thunk_template, tmpl_size);
- apply_relocation(insn_buff, tmpl_size, pad,
- skl_call_thunk_template, tmpl_size);
+ apply_relocation(insn_buff, pad, tmpl_size, skl_call_thunk_template, tmpl_size);
return !bcmp(pad, insn_buff, tmpl_size);
}
@@ -327,8 +325,7 @@ int x86_call_depth_emit_accounting(u8 **pprog, void *func, void *ip)
return 0;
memcpy(insn_buff, skl_call_thunk_template, tmpl_size);
- apply_relocation(insn_buff, tmpl_size, ip,
- skl_call_thunk_template, tmpl_size);
+ apply_relocation(insn_buff, ip, tmpl_size, skl_call_thunk_template, tmpl_size);
memcpy(*pprog, insn_buff, tmpl_size);
*pprog += tmpl_size;
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index eb4dbcdf41f1..5857a0f5d514 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -34,7 +34,7 @@ obj-$(CONFIG_PROC_FS) += proc.o
obj-$(CONFIG_IA32_FEAT_CTL) += feat_ctl.o
ifdef CONFIG_CPU_SUP_INTEL
-obj-y += intel.o intel_pconfig.o tsx.o
+obj-y += intel.o tsx.o
obj-$(CONFIG_PM) += intel_epb.o
endif
obj-$(CONFIG_CPU_SUP_AMD) += amd.o
@@ -60,7 +60,7 @@ obj-$(CONFIG_ACRN_GUEST) += acrn.o
obj-$(CONFIG_DEBUG_FS) += debugfs.o
quiet_cmd_mkcapflags = MKCAP $@
- cmd_mkcapflags = $(CONFIG_SHELL) $(srctree)/$(src)/mkcapflags.sh $@ $^
+ cmd_mkcapflags = $(CONFIG_SHELL) $(src)/mkcapflags.sh $@ $^
cpufeature = $(src)/../../include/asm/cpufeatures.h
vmxfeature = $(src)/../../include/asm/vmxfeatures.h
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 307302af0aee..015971adadfc 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -13,6 +13,7 @@
#include <asm/apic.h>
#include <asm/cacheinfo.h>
#include <asm/cpu.h>
+#include <asm/cpu_device_id.h>
#include <asm/spec-ctrl.h>
#include <asm/smp.h>
#include <asm/numa.h>
@@ -461,7 +462,7 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
switch (c->x86_model) {
case 0x00 ... 0x2f:
case 0x40 ... 0x4f:
- case 0x70 ... 0x7f:
+ case 0x60 ... 0x7f:
setup_force_cpu_cap(X86_FEATURE_ZEN5);
break;
default:
@@ -794,6 +795,11 @@ static void init_amd_bd(struct cpuinfo_x86 *c)
clear_rdrand_cpuid_bit(c);
}
+static const struct x86_cpu_desc erratum_1386_microcode[] = {
+ AMD_CPU_DESC(0x17, 0x1, 0x2, 0x0800126e),
+ AMD_CPU_DESC(0x17, 0x31, 0x0, 0x08301052),
+};
+
static void fix_erratum_1386(struct cpuinfo_x86 *c)
{
/*
@@ -803,7 +809,13 @@ static void fix_erratum_1386(struct cpuinfo_x86 *c)
*
* Affected parts all have no supervisor XSAVE states, meaning that
* the XSAVEC instruction (which works fine) is equivalent.
+ *
+ * Clear the feature flag only on microcode revisions which
+ * don't have the fix.
*/
+ if (x86_cpu_has_min_microcode_rev(erratum_1386_microcode))
+ return;
+
clear_cpu_cap(c, X86_FEATURE_XSAVES);
}
@@ -1178,22 +1190,6 @@ unsigned long amd_get_dr_addr_mask(unsigned int dr)
}
EXPORT_SYMBOL_GPL(amd_get_dr_addr_mask);
-u32 amd_get_highest_perf(void)
-{
- struct cpuinfo_x86 *c = &boot_cpu_data;
-
- if (c->x86 == 0x17 && ((c->x86_model >= 0x30 && c->x86_model < 0x40) ||
- (c->x86_model >= 0x70 && c->x86_model < 0x80)))
- return 166;
-
- if (c->x86 == 0x19 && ((c->x86_model >= 0x20 && c->x86_model < 0x30) ||
- (c->x86_model >= 0x40 && c->x86_model < 0x70)))
- return 166;
-
- return 255;
-}
-EXPORT_SYMBOL_GPL(amd_get_highest_perf);
-
static void zenbleed_check_cpu(void *unused)
{
struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
@@ -1208,14 +1204,3 @@ void amd_check_microcode(void)
on_each_cpu(zenbleed_check_cpu, NULL, 1);
}
-
-/*
- * Issue a DIV 0/1 insn to clear any division data from previous DIV
- * operations.
- */
-void noinstr amd_clear_divider(void)
-{
- asm volatile(ALTERNATIVE("", "div %2\n\t", X86_BUG_DIV0)
- :: "a" (0), "d" (0), "r" (1));
-}
-EXPORT_SYMBOL_GPL(amd_clear_divider);
diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c
index fdbb5f07448f..f642de2ebdac 100644
--- a/arch/x86/kernel/cpu/aperfmperf.c
+++ b/arch/x86/kernel/cpu/aperfmperf.c
@@ -124,25 +124,24 @@ static bool __init slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
return true;
}
-#define X86_MATCH(model) \
- X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, \
- INTEL_FAM6_##model, X86_FEATURE_APERFMPERF, NULL)
+#define X86_MATCH(vfm) \
+ X86_MATCH_VFM_FEATURE(vfm, X86_FEATURE_APERFMPERF, NULL)
static const struct x86_cpu_id has_knl_turbo_ratio_limits[] __initconst = {
- X86_MATCH(XEON_PHI_KNL),
- X86_MATCH(XEON_PHI_KNM),
+ X86_MATCH(INTEL_XEON_PHI_KNL),
+ X86_MATCH(INTEL_XEON_PHI_KNM),
{}
};
static const struct x86_cpu_id has_skx_turbo_ratio_limits[] __initconst = {
- X86_MATCH(SKYLAKE_X),
+ X86_MATCH(INTEL_SKYLAKE_X),
{}
};
static const struct x86_cpu_id has_glm_turbo_ratio_limits[] __initconst = {
- X86_MATCH(ATOM_GOLDMONT),
- X86_MATCH(ATOM_GOLDMONT_D),
- X86_MATCH(ATOM_GOLDMONT_PLUS),
+ X86_MATCH(INTEL_ATOM_GOLDMONT),
+ X86_MATCH(INTEL_ATOM_GOLDMONT_D),
+ X86_MATCH(INTEL_ATOM_GOLDMONT_PLUS),
{}
};
@@ -307,7 +306,7 @@ static void freq_invariance_enable(void)
WARN_ON_ONCE(1);
return;
}
- static_branch_enable(&arch_scale_freq_key);
+ static_branch_enable_cpuslocked(&arch_scale_freq_key);
register_freq_invariance_syscore_ops();
pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio);
}
@@ -324,8 +323,10 @@ static void __init bp_init_freq_invariance(void)
if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
return;
- if (intel_set_max_freq_ratio())
+ if (intel_set_max_freq_ratio()) {
+ guard(cpus_read_lock)();
freq_invariance_enable();
+ }
}
static void disable_freq_invariance_workfn(struct work_struct *work)
@@ -346,10 +347,91 @@ static DECLARE_WORK(disable_freq_invariance_work,
disable_freq_invariance_workfn);
DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE;
+EXPORT_PER_CPU_SYMBOL_GPL(arch_freq_scale);
+
+static DEFINE_STATIC_KEY_FALSE(arch_hybrid_cap_scale_key);
+
+struct arch_hybrid_cpu_scale {
+ unsigned long capacity;
+ unsigned long freq_ratio;
+};
+
+static struct arch_hybrid_cpu_scale __percpu *arch_cpu_scale;
+
+/**
+ * arch_enable_hybrid_capacity_scale() - Enable hybrid CPU capacity scaling
+ *
+ * Allocate memory for per-CPU data used by hybrid CPU capacity scaling,
+ * initialize it and set the static key controlling its code paths.
+ *
+ * Must be called before arch_set_cpu_capacity().
+ */
+bool arch_enable_hybrid_capacity_scale(void)
+{
+ int cpu;
+
+ if (static_branch_unlikely(&arch_hybrid_cap_scale_key)) {
+ WARN_ONCE(1, "Hybrid CPU capacity scaling already enabled");
+ return true;
+ }
+
+ arch_cpu_scale = alloc_percpu(struct arch_hybrid_cpu_scale);
+ if (!arch_cpu_scale)
+ return false;
+
+ for_each_possible_cpu(cpu) {
+ per_cpu_ptr(arch_cpu_scale, cpu)->capacity = SCHED_CAPACITY_SCALE;
+ per_cpu_ptr(arch_cpu_scale, cpu)->freq_ratio = arch_max_freq_ratio;
+ }
+
+ static_branch_enable(&arch_hybrid_cap_scale_key);
+
+ pr_info("Hybrid CPU capacity scaling enabled\n");
+
+ return true;
+}
+
+/**
+ * arch_set_cpu_capacity() - Set scale-invariance parameters for a CPU
+ * @cpu: Target CPU.
+ * @cap: Capacity of @cpu at its maximum frequency, relative to @max_cap.
+ * @max_cap: System-wide maximum CPU capacity.
+ * @cap_freq: Frequency of @cpu corresponding to @cap.
+ * @base_freq: Frequency of @cpu at which MPERF counts.
+ *
+ * The units in which @cap and @max_cap are expressed do not matter, so long
+ * as they are consistent, because the former is effectively divided by the
+ * latter. Analogously for @cap_freq and @base_freq.
+ *
+ * After calling this function for all CPUs, call arch_rebuild_sched_domains()
+ * to let the scheduler know that capacity-aware scheduling can be used going
+ * forward.
+ */
+void arch_set_cpu_capacity(int cpu, unsigned long cap, unsigned long max_cap,
+ unsigned long cap_freq, unsigned long base_freq)
+{
+ if (static_branch_likely(&arch_hybrid_cap_scale_key)) {
+ WRITE_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->capacity,
+ div_u64(cap << SCHED_CAPACITY_SHIFT, max_cap));
+ WRITE_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->freq_ratio,
+ div_u64(cap_freq << SCHED_CAPACITY_SHIFT, base_freq));
+ } else {
+ WARN_ONCE(1, "Hybrid CPU capacity scaling not enabled");
+ }
+}
+
+unsigned long arch_scale_cpu_capacity(int cpu)
+{
+ if (static_branch_unlikely(&arch_hybrid_cap_scale_key))
+ return READ_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->capacity);
+
+ return SCHED_CAPACITY_SCALE;
+}
+EXPORT_SYMBOL_GPL(arch_scale_cpu_capacity);
static void scale_freq_tick(u64 acnt, u64 mcnt)
{
- u64 freq_scale;
+ u64 freq_scale, freq_ratio;
if (!arch_scale_freq_invariant())
return;
@@ -357,7 +439,12 @@ static void scale_freq_tick(u64 acnt, u64 mcnt)
if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt))
goto error;
- if (check_mul_overflow(mcnt, arch_max_freq_ratio, &mcnt) || !mcnt)
+ if (static_branch_unlikely(&arch_hybrid_cap_scale_key))
+ freq_ratio = READ_ONCE(this_cpu_ptr(arch_cpu_scale)->freq_ratio);
+ else
+ freq_ratio = arch_max_freq_ratio;
+
+ if (check_mul_overflow(mcnt, freq_ratio, &mcnt) || !mcnt)
goto error;
freq_scale = div64_u64(acnt, mcnt);
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index ab18185894df..d1915427b4ff 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -26,7 +26,7 @@
#include <asm/msr.h>
#include <asm/vmx.h>
#include <asm/paravirt.h>
-#include <asm/intel-family.h>
+#include <asm/cpu_device_id.h>
#include <asm/e820/api.h>
#include <asm/hypervisor.h>
#include <asm/tlbflush.h>
@@ -233,7 +233,8 @@ static void x86_amd_ssb_disable(void)
#define pr_fmt(fmt) "MDS: " fmt
/* Default mitigation for MDS-affected CPUs */
-static enum mds_mitigations mds_mitigation __ro_after_init = MDS_MITIGATION_FULL;
+static enum mds_mitigations mds_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_MDS) ? MDS_MITIGATION_FULL : MDS_MITIGATION_OFF;
static bool mds_nosmt __ro_after_init = false;
static const char * const mds_strings[] = {
@@ -293,7 +294,8 @@ enum taa_mitigations {
};
/* Default mitigation for TAA-affected CPUs */
-static enum taa_mitigations taa_mitigation __ro_after_init = TAA_MITIGATION_VERW;
+static enum taa_mitigations taa_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_TAA) ? TAA_MITIGATION_VERW : TAA_MITIGATION_OFF;
static bool taa_nosmt __ro_after_init;
static const char * const taa_strings[] = {
@@ -391,7 +393,8 @@ enum mmio_mitigations {
};
/* Default mitigation for Processor MMIO Stale Data vulnerabilities */
-static enum mmio_mitigations mmio_mitigation __ro_after_init = MMIO_MITIGATION_VERW;
+static enum mmio_mitigations mmio_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_MMIO_STALE_DATA) ? MMIO_MITIGATION_VERW : MMIO_MITIGATION_OFF;
static bool mmio_nosmt __ro_after_init = false;
static const char * const mmio_strings[] = {
@@ -605,7 +608,8 @@ enum srbds_mitigations {
SRBDS_MITIGATION_HYPERVISOR,
};
-static enum srbds_mitigations srbds_mitigation __ro_after_init = SRBDS_MITIGATION_FULL;
+static enum srbds_mitigations srbds_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_SRBDS) ? SRBDS_MITIGATION_FULL : SRBDS_MITIGATION_OFF;
static const char * const srbds_strings[] = {
[SRBDS_MITIGATION_OFF] = "Vulnerable",
@@ -731,11 +735,8 @@ enum gds_mitigations {
GDS_MITIGATION_HYPERVISOR,
};
-#if IS_ENABLED(CONFIG_MITIGATION_GDS_FORCE)
-static enum gds_mitigations gds_mitigation __ro_after_init = GDS_MITIGATION_FORCE;
-#else
-static enum gds_mitigations gds_mitigation __ro_after_init = GDS_MITIGATION_FULL;
-#endif
+static enum gds_mitigations gds_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_GDS) ? GDS_MITIGATION_FULL : GDS_MITIGATION_OFF;
static const char * const gds_strings[] = {
[GDS_MITIGATION_OFF] = "Vulnerable",
@@ -871,7 +872,8 @@ enum spectre_v1_mitigation {
};
static enum spectre_v1_mitigation spectre_v1_mitigation __ro_after_init =
- SPECTRE_V1_MITIGATION_AUTO;
+ IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V1) ?
+ SPECTRE_V1_MITIGATION_AUTO : SPECTRE_V1_MITIGATION_NONE;
static const char * const spectre_v1_strings[] = {
[SPECTRE_V1_MITIGATION_NONE] = "Vulnerable: __user pointer sanitization and usercopy barriers only; no swapgs barriers",
@@ -986,7 +988,7 @@ static const char * const retbleed_strings[] = {
static enum retbleed_mitigation retbleed_mitigation __ro_after_init =
RETBLEED_MITIGATION_NONE;
static enum retbleed_mitigation_cmd retbleed_cmd __ro_after_init =
- RETBLEED_CMD_AUTO;
+ IS_ENABLED(CONFIG_MITIGATION_RETBLEED) ? RETBLEED_CMD_AUTO : RETBLEED_CMD_OFF;
static int __ro_after_init retbleed_nosmt = false;
@@ -1447,17 +1449,18 @@ static void __init spec_v2_print_cond(const char *reason, bool secure)
static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
{
- enum spectre_v2_mitigation_cmd cmd = SPECTRE_V2_CMD_AUTO;
+ enum spectre_v2_mitigation_cmd cmd;
char arg[20];
int ret, i;
+ cmd = IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V2) ? SPECTRE_V2_CMD_AUTO : SPECTRE_V2_CMD_NONE;
if (cmdline_find_option_bool(boot_command_line, "nospectre_v2") ||
cpu_mitigations_off())
return SPECTRE_V2_CMD_NONE;
ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, sizeof(arg));
if (ret < 0)
- return SPECTRE_V2_CMD_AUTO;
+ return cmd;
for (i = 0; i < ARRAY_SIZE(mitigation_options); i++) {
if (!match_option(arg, ret, mitigation_options[i].option))
@@ -1467,8 +1470,8 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
}
if (i >= ARRAY_SIZE(mitigation_options)) {
- pr_err("unknown option (%s). Switching to AUTO select\n", arg);
- return SPECTRE_V2_CMD_AUTO;
+ pr_err("unknown option (%s). Switching to default mode\n", arg);
+ return cmd;
}
if ((cmd == SPECTRE_V2_CMD_RETPOLINE ||
@@ -1625,6 +1628,7 @@ static bool __init spec_ctrl_bhi_dis(void)
enum bhi_mitigations {
BHI_MITIGATION_OFF,
BHI_MITIGATION_ON,
+ BHI_MITIGATION_VMEXIT_ONLY,
};
static enum bhi_mitigations bhi_mitigation __ro_after_init =
@@ -1639,6 +1643,8 @@ static int __init spectre_bhi_parse_cmdline(char *str)
bhi_mitigation = BHI_MITIGATION_OFF;
else if (!strcmp(str, "on"))
bhi_mitigation = BHI_MITIGATION_ON;
+ else if (!strcmp(str, "vmexit"))
+ bhi_mitigation = BHI_MITIGATION_VMEXIT_ONLY;
else
pr_err("Ignoring unknown spectre_bhi option (%s)", str);
@@ -1659,19 +1665,22 @@ static void __init bhi_select_mitigation(void)
return;
}
+ /* Mitigate in hardware if supported */
if (spec_ctrl_bhi_dis())
return;
if (!IS_ENABLED(CONFIG_X86_64))
return;
- /* Mitigate KVM by default */
- setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT);
- pr_info("Spectre BHI mitigation: SW BHB clearing on vm exit\n");
+ if (bhi_mitigation == BHI_MITIGATION_VMEXIT_ONLY) {
+ pr_info("Spectre BHI mitigation: SW BHB clearing on VM exit only\n");
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT);
+ return;
+ }
- /* Mitigate syscalls when the mitigation is forced =on */
+ pr_info("Spectre BHI mitigation: SW BHB clearing on syscall and VM exit\n");
setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP);
- pr_info("Spectre BHI mitigation: SW BHB clearing on syscall\n");
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT);
}
static void __init spectre_v2_select_mitigation(void)
@@ -2015,10 +2024,12 @@ static const struct {
static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void)
{
- enum ssb_mitigation_cmd cmd = SPEC_STORE_BYPASS_CMD_AUTO;
+ enum ssb_mitigation_cmd cmd;
char arg[20];
int ret, i;
+ cmd = IS_ENABLED(CONFIG_MITIGATION_SSB) ?
+ SPEC_STORE_BYPASS_CMD_AUTO : SPEC_STORE_BYPASS_CMD_NONE;
if (cmdline_find_option_bool(boot_command_line, "nospec_store_bypass_disable") ||
cpu_mitigations_off()) {
return SPEC_STORE_BYPASS_CMD_NONE;
@@ -2026,7 +2037,7 @@ static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void)
ret = cmdline_find_option(boot_command_line, "spec_store_bypass_disable",
arg, sizeof(arg));
if (ret < 0)
- return SPEC_STORE_BYPASS_CMD_AUTO;
+ return cmd;
for (i = 0; i < ARRAY_SIZE(ssb_mitigation_options); i++) {
if (!match_option(arg, ret, ssb_mitigation_options[i].option))
@@ -2037,8 +2048,8 @@ static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void)
}
if (i >= ARRAY_SIZE(ssb_mitigation_options)) {
- pr_err("unknown option (%s). Switching to AUTO select\n", arg);
- return SPEC_STORE_BYPASS_CMD_AUTO;
+ pr_err("unknown option (%s). Switching to default mode\n", arg);
+ return cmd;
}
}
@@ -2365,7 +2376,8 @@ EXPORT_SYMBOL_GPL(itlb_multihit_kvm_mitigation);
#define pr_fmt(fmt) "L1TF: " fmt
/* Default mitigation for L1TF-affected CPUs */
-enum l1tf_mitigations l1tf_mitigation __ro_after_init = L1TF_MITIGATION_FLUSH;
+enum l1tf_mitigations l1tf_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_L1TF) ? L1TF_MITIGATION_FLUSH : L1TF_MITIGATION_OFF;
#if IS_ENABLED(CONFIG_KVM_INTEL)
EXPORT_SYMBOL_GPL(l1tf_mitigation);
#endif
@@ -2391,20 +2403,20 @@ static void override_cache_bits(struct cpuinfo_x86 *c)
if (c->x86 != 6)
return;
- switch (c->x86_model) {
- case INTEL_FAM6_NEHALEM:
- case INTEL_FAM6_WESTMERE:
- case INTEL_FAM6_SANDYBRIDGE:
- case INTEL_FAM6_IVYBRIDGE:
- case INTEL_FAM6_HASWELL:
- case INTEL_FAM6_HASWELL_L:
- case INTEL_FAM6_HASWELL_G:
- case INTEL_FAM6_BROADWELL:
- case INTEL_FAM6_BROADWELL_G:
- case INTEL_FAM6_SKYLAKE_L:
- case INTEL_FAM6_SKYLAKE:
- case INTEL_FAM6_KABYLAKE_L:
- case INTEL_FAM6_KABYLAKE:
+ switch (c->x86_vfm) {
+ case INTEL_NEHALEM:
+ case INTEL_WESTMERE:
+ case INTEL_SANDYBRIDGE:
+ case INTEL_IVYBRIDGE:
+ case INTEL_HASWELL:
+ case INTEL_HASWELL_L:
+ case INTEL_HASWELL_G:
+ case INTEL_BROADWELL:
+ case INTEL_BROADWELL_G:
+ case INTEL_SKYLAKE_L:
+ case INTEL_SKYLAKE:
+ case INTEL_KABYLAKE_L:
+ case INTEL_KABYLAKE:
if (c->x86_cache_bits < 44)
c->x86_cache_bits = 44;
break;
@@ -2545,10 +2557,9 @@ static void __init srso_select_mitigation(void)
{
bool has_microcode = boot_cpu_has(X86_FEATURE_IBPB_BRTYPE);
- if (cpu_mitigations_off())
- return;
-
- if (!boot_cpu_has_bug(X86_BUG_SRSO)) {
+ if (!boot_cpu_has_bug(X86_BUG_SRSO) ||
+ cpu_mitigations_off() ||
+ srso_cmd == SRSO_CMD_OFF) {
if (boot_cpu_has(X86_FEATURE_SBPB))
x86_pred_cmd = PRED_CMD_SBPB;
return;
@@ -2579,11 +2590,6 @@ static void __init srso_select_mitigation(void)
}
switch (srso_cmd) {
- case SRSO_CMD_OFF:
- if (boot_cpu_has(X86_FEATURE_SBPB))
- x86_pred_cmd = PRED_CMD_SBPB;
- return;
-
case SRSO_CMD_MICROCODE:
if (has_microcode) {
srso_mitigation = SRSO_MITIGATION_MICROCODE;
@@ -2637,6 +2643,8 @@ static void __init srso_select_mitigation(void)
pr_err("WARNING: kernel not compiled with MITIGATION_SRSO.\n");
}
break;
+ default:
+ break;
}
out:
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 605c26c009c8..07a34d723505 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -68,6 +68,7 @@
#include <asm/traps.h>
#include <asm/sev.h>
#include <asm/tdx.h>
+#include <asm/posted_intr.h>
#include "cpu.h"
@@ -114,17 +115,17 @@ static const struct x86_cpu_id ppin_cpuids[] = {
X86_MATCH_FEATURE(X86_FEATURE_INTEL_PPIN, &ppin_info[X86_VENDOR_INTEL]),
/* Legacy models without CPUID enumeration */
- X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE_X, &ppin_info[X86_VENDOR_INTEL]),
- X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X, &ppin_info[X86_VENDOR_INTEL]),
- X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_D, &ppin_info[X86_VENDOR_INTEL]),
- X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X, &ppin_info[X86_VENDOR_INTEL]),
- X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X, &ppin_info[X86_VENDOR_INTEL]),
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &ppin_info[X86_VENDOR_INTEL]),
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, &ppin_info[X86_VENDOR_INTEL]),
- X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &ppin_info[X86_VENDOR_INTEL]),
- X86_MATCH_INTEL_FAM6_MODEL(EMERALDRAPIDS_X, &ppin_info[X86_VENDOR_INTEL]),
- X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL, &ppin_info[X86_VENDOR_INTEL]),
- X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_IVYBRIDGE_X, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_HASWELL_X, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_BROADWELL_D, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_BROADWELL_X, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_SKYLAKE_X, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_ICELAKE_X, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_ICELAKE_D, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_XEON_PHI_KNL, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_XEON_PHI_KNM, &ppin_info[X86_VENDOR_INTEL]),
{}
};
@@ -1053,18 +1054,9 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
void get_cpu_address_sizes(struct cpuinfo_x86 *c)
{
u32 eax, ebx, ecx, edx;
- bool vp_bits_from_cpuid = true;
if (!cpu_has(c, X86_FEATURE_CPUID) ||
- (c->extended_cpuid_level < 0x80000008))
- vp_bits_from_cpuid = false;
-
- if (vp_bits_from_cpuid) {
- cpuid(0x80000008, &eax, &ebx, &ecx, &edx);
-
- c->x86_virt_bits = (eax >> 8) & 0xff;
- c->x86_phys_bits = eax & 0xff;
- } else {
+ (c->extended_cpuid_level < 0x80000008)) {
if (IS_ENABLED(CONFIG_X86_64)) {
c->x86_clflush_size = 64;
c->x86_phys_bits = 36;
@@ -1078,7 +1070,17 @@ void get_cpu_address_sizes(struct cpuinfo_x86 *c)
cpu_has(c, X86_FEATURE_PSE36))
c->x86_phys_bits = 36;
}
+ } else {
+ cpuid(0x80000008, &eax, &ebx, &ecx, &edx);
+
+ c->x86_virt_bits = (eax >> 8) & 0xff;
+ c->x86_phys_bits = eax & 0xff;
+
+ /* Provide a sane default if not enumerated: */
+ if (!c->x86_clflush_size)
+ c->x86_clflush_size = 32;
}
+
c->x86_cache_bits = c->x86_phys_bits;
c->x86_cache_alignment = c->x86_clflush_size;
}
@@ -1125,8 +1127,8 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
#define VULNWL(vendor, family, model, whitelist) \
X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, whitelist)
-#define VULNWL_INTEL(model, whitelist) \
- VULNWL(INTEL, 6, INTEL_FAM6_##model, whitelist)
+#define VULNWL_INTEL(vfm, whitelist) \
+ X86_MATCH_VFM(vfm, whitelist)
#define VULNWL_AMD(family, whitelist) \
VULNWL(AMD, family, X86_MODEL_ANY, whitelist)
@@ -1143,32 +1145,32 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
VULNWL(VORTEX, 6, X86_MODEL_ANY, NO_SPECULATION),
/* Intel Family 6 */
- VULNWL_INTEL(TIGERLAKE, NO_MMIO),
- VULNWL_INTEL(TIGERLAKE_L, NO_MMIO),
- VULNWL_INTEL(ALDERLAKE, NO_MMIO),
- VULNWL_INTEL(ALDERLAKE_L, NO_MMIO),
+ VULNWL_INTEL(INTEL_TIGERLAKE, NO_MMIO),
+ VULNWL_INTEL(INTEL_TIGERLAKE_L, NO_MMIO),
+ VULNWL_INTEL(INTEL_ALDERLAKE, NO_MMIO),
+ VULNWL_INTEL(INTEL_ALDERLAKE_L, NO_MMIO),
- VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(ATOM_SALTWELL_TABLET, NO_SPECULATION | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(ATOM_SALTWELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(ATOM_BONNELL, NO_SPECULATION | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(ATOM_BONNELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_SALTWELL, NO_SPECULATION | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_SALTWELL_TABLET, NO_SPECULATION | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_SALTWELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_BONNELL, NO_SPECULATION | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_BONNELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(ATOM_SILVERMONT_D, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(ATOM_AIRMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(XEON_PHI_KNL, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(XEON_PHI_KNM, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_SILVERMONT_D, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_AIRMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_XEON_PHI_KNL, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_XEON_PHI_KNM, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(CORE_YONAH, NO_SSB),
+ VULNWL_INTEL(INTEL_CORE_YONAH, NO_SSB),
- VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(ATOM_AIRMONT_NP, NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_AIRMONT_MID, NO_SSB | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | MSBDS_ONLY),
+ VULNWL_INTEL(INTEL_ATOM_AIRMONT_NP, NO_SSB | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO),
- VULNWL_INTEL(ATOM_GOLDMONT_D, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO),
- VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB),
+ VULNWL_INTEL(INTEL_ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO),
+ VULNWL_INTEL(INTEL_ATOM_GOLDMONT_D, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO),
+ VULNWL_INTEL(INTEL_ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB),
/*
* Technically, swapgs isn't serializing on AMD (despite it previously
@@ -1178,9 +1180,9 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
* good enough for our purposes.
*/
- VULNWL_INTEL(ATOM_TREMONT, NO_EIBRS_PBRSB),
- VULNWL_INTEL(ATOM_TREMONT_L, NO_EIBRS_PBRSB),
- VULNWL_INTEL(ATOM_TREMONT_D, NO_ITLB_MULTIHIT | NO_EIBRS_PBRSB),
+ VULNWL_INTEL(INTEL_ATOM_TREMONT, NO_EIBRS_PBRSB),
+ VULNWL_INTEL(INTEL_ATOM_TREMONT_L, NO_EIBRS_PBRSB),
+ VULNWL_INTEL(INTEL_ATOM_TREMONT_D, NO_ITLB_MULTIHIT | NO_EIBRS_PBRSB),
/* AMD Family 0xf - 0x12 */
VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_BHI),
@@ -1201,10 +1203,8 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
#define VULNBL(vendor, family, model, blacklist) \
X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, blacklist)
-#define VULNBL_INTEL_STEPPINGS(model, steppings, issues) \
- X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(INTEL, 6, \
- INTEL_FAM6_##model, steppings, \
- X86_FEATURE_ANY, issues)
+#define VULNBL_INTEL_STEPPINGS(vfm, steppings, issues) \
+ X86_MATCH_VFM_STEPPINGS(vfm, steppings, issues)
#define VULNBL_AMD(family, blacklist) \
VULNBL(AMD, family, X86_MODEL_ANY, blacklist)
@@ -1229,43 +1229,43 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
#define RFDS BIT(7)
static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = {
- VULNBL_INTEL_STEPPINGS(IVYBRIDGE, X86_STEPPING_ANY, SRBDS),
- VULNBL_INTEL_STEPPINGS(HASWELL, X86_STEPPING_ANY, SRBDS),
- VULNBL_INTEL_STEPPINGS(HASWELL_L, X86_STEPPING_ANY, SRBDS),
- VULNBL_INTEL_STEPPINGS(HASWELL_G, X86_STEPPING_ANY, SRBDS),
- VULNBL_INTEL_STEPPINGS(HASWELL_X, X86_STEPPING_ANY, MMIO),
- VULNBL_INTEL_STEPPINGS(BROADWELL_D, X86_STEPPING_ANY, MMIO),
- VULNBL_INTEL_STEPPINGS(BROADWELL_G, X86_STEPPING_ANY, SRBDS),
- VULNBL_INTEL_STEPPINGS(BROADWELL_X, X86_STEPPING_ANY, MMIO),
- VULNBL_INTEL_STEPPINGS(BROADWELL, X86_STEPPING_ANY, SRBDS),
- VULNBL_INTEL_STEPPINGS(SKYLAKE_X, X86_STEPPING_ANY, MMIO | RETBLEED | GDS),
- VULNBL_INTEL_STEPPINGS(SKYLAKE_L, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS),
- VULNBL_INTEL_STEPPINGS(SKYLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS),
- VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS),
- VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS),
- VULNBL_INTEL_STEPPINGS(CANNONLAKE_L, X86_STEPPING_ANY, RETBLEED),
- VULNBL_INTEL_STEPPINGS(ICELAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS),
- VULNBL_INTEL_STEPPINGS(ICELAKE_D, X86_STEPPING_ANY, MMIO | GDS),
- VULNBL_INTEL_STEPPINGS(ICELAKE_X, X86_STEPPING_ANY, MMIO | GDS),
- VULNBL_INTEL_STEPPINGS(COMETLAKE, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS),
- VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPINGS(0x0, 0x0), MMIO | RETBLEED),
- VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS),
- VULNBL_INTEL_STEPPINGS(TIGERLAKE_L, X86_STEPPING_ANY, GDS),
- VULNBL_INTEL_STEPPINGS(TIGERLAKE, X86_STEPPING_ANY, GDS),
- VULNBL_INTEL_STEPPINGS(LAKEFIELD, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED),
- VULNBL_INTEL_STEPPINGS(ROCKETLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS),
- VULNBL_INTEL_STEPPINGS(ALDERLAKE, X86_STEPPING_ANY, RFDS),
- VULNBL_INTEL_STEPPINGS(ALDERLAKE_L, X86_STEPPING_ANY, RFDS),
- VULNBL_INTEL_STEPPINGS(RAPTORLAKE, X86_STEPPING_ANY, RFDS),
- VULNBL_INTEL_STEPPINGS(RAPTORLAKE_P, X86_STEPPING_ANY, RFDS),
- VULNBL_INTEL_STEPPINGS(RAPTORLAKE_S, X86_STEPPING_ANY, RFDS),
- VULNBL_INTEL_STEPPINGS(ATOM_GRACEMONT, X86_STEPPING_ANY, RFDS),
- VULNBL_INTEL_STEPPINGS(ATOM_TREMONT, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RFDS),
- VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_D, X86_STEPPING_ANY, MMIO | RFDS),
- VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RFDS),
- VULNBL_INTEL_STEPPINGS(ATOM_GOLDMONT, X86_STEPPING_ANY, RFDS),
- VULNBL_INTEL_STEPPINGS(ATOM_GOLDMONT_D, X86_STEPPING_ANY, RFDS),
- VULNBL_INTEL_STEPPINGS(ATOM_GOLDMONT_PLUS, X86_STEPPING_ANY, RFDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_IVYBRIDGE, X86_STEPPING_ANY, SRBDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_HASWELL, X86_STEPPING_ANY, SRBDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_HASWELL_L, X86_STEPPING_ANY, SRBDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_HASWELL_G, X86_STEPPING_ANY, SRBDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_HASWELL_X, X86_STEPPING_ANY, MMIO),
+ VULNBL_INTEL_STEPPINGS(INTEL_BROADWELL_D, X86_STEPPING_ANY, MMIO),
+ VULNBL_INTEL_STEPPINGS(INTEL_BROADWELL_G, X86_STEPPING_ANY, SRBDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_BROADWELL_X, X86_STEPPING_ANY, MMIO),
+ VULNBL_INTEL_STEPPINGS(INTEL_BROADWELL, X86_STEPPING_ANY, SRBDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_SKYLAKE_X, X86_STEPPING_ANY, MMIO | RETBLEED | GDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_SKYLAKE_L, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_SKYLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_KABYLAKE_L, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_KABYLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_CANNONLAKE_L, X86_STEPPING_ANY, RETBLEED),
+ VULNBL_INTEL_STEPPINGS(INTEL_ICELAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_ICELAKE_D, X86_STEPPING_ANY, MMIO | GDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_ICELAKE_X, X86_STEPPING_ANY, MMIO | GDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_COMETLAKE, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_COMETLAKE_L, X86_STEPPINGS(0x0, 0x0), MMIO | RETBLEED),
+ VULNBL_INTEL_STEPPINGS(INTEL_COMETLAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_TIGERLAKE_L, X86_STEPPING_ANY, GDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_TIGERLAKE, X86_STEPPING_ANY, GDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_LAKEFIELD, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED),
+ VULNBL_INTEL_STEPPINGS(INTEL_ROCKETLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_ALDERLAKE, X86_STEPPING_ANY, RFDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_ALDERLAKE_L, X86_STEPPING_ANY, RFDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_RAPTORLAKE, X86_STEPPING_ANY, RFDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_RAPTORLAKE_P, X86_STEPPING_ANY, RFDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_RAPTORLAKE_S, X86_STEPPING_ANY, RFDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_ATOM_GRACEMONT, X86_STEPPING_ANY, RFDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_ATOM_TREMONT, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RFDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_ATOM_TREMONT_D, X86_STEPPING_ANY, MMIO | RFDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_ATOM_TREMONT_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RFDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_ATOM_GOLDMONT, X86_STEPPING_ANY, RFDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_ATOM_GOLDMONT_D, X86_STEPPING_ANY, RFDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_ATOM_GOLDMONT_PLUS, X86_STEPPING_ANY, RFDS),
VULNBL_AMD(0x15, RETBLEED),
VULNBL_AMD(0x16, RETBLEED),
@@ -1510,6 +1510,11 @@ static void __init cpu_parse_early_param(void)
if (cmdline_find_option_bool(boot_command_line, "nousershstk"))
setup_clear_cpu_cap(X86_FEATURE_USER_SHSTK);
+ /* Minimize the gap between FRED is available and available but disabled. */
+ arglen = cmdline_find_option(boot_command_line, "fred", arg, sizeof(arg));
+ if (arglen != 2 || strncmp(arg, "on", 2))
+ setup_clear_cpu_cap(X86_FEATURE_FRED);
+
arglen = cmdline_find_option(boot_command_line, "clearcpuid", arg, sizeof(arg));
if (arglen <= 0)
return;
@@ -1589,6 +1594,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
if (have_cpuid_p()) {
cpu_detect(c);
get_cpu_vendor(c);
+ intel_unlock_cpuid_leafs(c);
get_cpu_cap(c);
setup_force_cpu_cap(X86_FEATURE_CPUID);
get_cpu_address_sizes(c);
@@ -1748,7 +1754,7 @@ static void generic_identify(struct cpuinfo_x86 *c)
cpu_detect(c);
get_cpu_vendor(c);
-
+ intel_unlock_cpuid_leafs(c);
get_cpu_cap(c);
get_cpu_address_sizes(c);
@@ -2170,7 +2176,7 @@ static inline void tss_setup_io_bitmap(struct tss_struct *tss)
* Setup everything needed to handle exceptions from the IDT, including the IST
* exceptions which use paranoid_entry().
*/
-void cpu_init_exception_handling(void)
+void cpu_init_exception_handling(bool boot_cpu)
{
struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw);
int cpu = raw_smp_processor_id();
@@ -2189,10 +2195,23 @@ void cpu_init_exception_handling(void)
/* GHCB needs to be setup to handle #VC. */
setup_ghcb();
+ if (cpu_feature_enabled(X86_FEATURE_FRED)) {
+ /* The boot CPU has enabled FRED during early boot */
+ if (!boot_cpu)
+ cpu_init_fred_exceptions();
+
+ cpu_init_fred_rsps();
+ } else {
+ load_current_idt();
+ }
+}
+
+void __init cpu_init_replace_early_idt(void)
+{
if (cpu_feature_enabled(X86_FEATURE_FRED))
cpu_init_fred_exceptions();
else
- load_current_idt();
+ idt_setup_early_pf();
}
/*
@@ -2227,6 +2246,8 @@ void cpu_init(void)
barrier();
x2apic_setup();
+
+ intel_posted_msi_init();
}
mmgrab(&init_mm);
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index ea9e07d57c8d..1beccefbaff9 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -61,9 +61,11 @@ extern __ro_after_init enum tsx_ctrl_states tsx_ctrl_state;
extern void __init tsx_init(void);
void tsx_ap_init(void);
+void intel_unlock_cpuid_leafs(struct cpuinfo_x86 *c);
#else
static inline void tsx_init(void) { }
static inline void tsx_ap_init(void) { }
+static inline void intel_unlock_cpuid_leafs(struct cpuinfo_x86 *c) { }
#endif /* CONFIG_CPU_SUP_INTEL */
extern void init_spectral_chicken(struct cpuinfo_x86 *c);
diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c
index 946813d816bf..8bd84114c2d9 100644
--- a/arch/x86/kernel/cpu/cpuid-deps.c
+++ b/arch/x86/kernel/cpu/cpuid-deps.c
@@ -83,7 +83,6 @@ static const struct cpuid_dep cpuid_deps[] = {
{ X86_FEATURE_AMX_TILE, X86_FEATURE_XFD },
{ X86_FEATURE_SHSTK, X86_FEATURE_XSAVES },
{ X86_FEATURE_FRED, X86_FEATURE_LKGS },
- { X86_FEATURE_FRED, X86_FEATURE_WRMSRNS },
{}
};
@@ -114,6 +113,9 @@ static void do_clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature)
if (WARN_ON(feature >= MAX_FEATURE_BITS))
return;
+ if (boot_cpu_has(feature))
+ WARN_ON(alternatives_patched);
+
clear_feature(c, feature);
/* Collect all features to disable, handling dependencies */
diff --git a/arch/x86/kernel/cpu/feat_ctl.c b/arch/x86/kernel/cpu/feat_ctl.c
index 1640ae76548f..4a4118784c13 100644
--- a/arch/x86/kernel/cpu/feat_ctl.c
+++ b/arch/x86/kernel/cpu/feat_ctl.c
@@ -188,7 +188,7 @@ update_caps:
update_sgx:
if (!(msr & FEAT_CTL_SGX_ENABLED)) {
if (enable_sgx_kvm || enable_sgx_driver)
- pr_err_once("SGX disabled by BIOS.\n");
+ pr_err_once("SGX disabled or unsupported by BIOS.\n");
clear_cpu_cap(c, X86_FEATURE_SGX);
return;
}
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index be30d7fa2e66..e7656cbef68d 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -72,19 +72,19 @@ static bool cpu_model_supports_sld __ro_after_init;
*/
static void check_memory_type_self_snoop_errata(struct cpuinfo_x86 *c)
{
- switch (c->x86_model) {
- case INTEL_FAM6_CORE_YONAH:
- case INTEL_FAM6_CORE2_MEROM:
- case INTEL_FAM6_CORE2_MEROM_L:
- case INTEL_FAM6_CORE2_PENRYN:
- case INTEL_FAM6_CORE2_DUNNINGTON:
- case INTEL_FAM6_NEHALEM:
- case INTEL_FAM6_NEHALEM_G:
- case INTEL_FAM6_NEHALEM_EP:
- case INTEL_FAM6_NEHALEM_EX:
- case INTEL_FAM6_WESTMERE:
- case INTEL_FAM6_WESTMERE_EP:
- case INTEL_FAM6_SANDYBRIDGE:
+ switch (c->x86_vfm) {
+ case INTEL_CORE_YONAH:
+ case INTEL_CORE2_MEROM:
+ case INTEL_CORE2_MEROM_L:
+ case INTEL_CORE2_PENRYN:
+ case INTEL_CORE2_DUNNINGTON:
+ case INTEL_NEHALEM:
+ case INTEL_NEHALEM_G:
+ case INTEL_NEHALEM_EP:
+ case INTEL_NEHALEM_EX:
+ case INTEL_WESTMERE:
+ case INTEL_WESTMERE_EP:
+ case INTEL_SANDYBRIDGE:
setup_clear_cpu_cap(X86_FEATURE_SELFSNOOP);
}
}
@@ -106,9 +106,9 @@ static void probe_xeon_phi_r3mwait(struct cpuinfo_x86 *c)
*/
if (c->x86 != 6)
return;
- switch (c->x86_model) {
- case INTEL_FAM6_XEON_PHI_KNL:
- case INTEL_FAM6_XEON_PHI_KNM:
+ switch (c->x86_vfm) {
+ case INTEL_XEON_PHI_KNL:
+ case INTEL_XEON_PHI_KNM:
break;
default:
return;
@@ -134,32 +134,32 @@ static void probe_xeon_phi_r3mwait(struct cpuinfo_x86 *c)
* - Release note from 20180108 microcode release
*/
struct sku_microcode {
- u8 model;
+ u32 vfm;
u8 stepping;
u32 microcode;
};
static const struct sku_microcode spectre_bad_microcodes[] = {
- { INTEL_FAM6_KABYLAKE, 0x0B, 0x80 },
- { INTEL_FAM6_KABYLAKE, 0x0A, 0x80 },
- { INTEL_FAM6_KABYLAKE, 0x09, 0x80 },
- { INTEL_FAM6_KABYLAKE_L, 0x0A, 0x80 },
- { INTEL_FAM6_KABYLAKE_L, 0x09, 0x80 },
- { INTEL_FAM6_SKYLAKE_X, 0x03, 0x0100013e },
- { INTEL_FAM6_SKYLAKE_X, 0x04, 0x0200003c },
- { INTEL_FAM6_BROADWELL, 0x04, 0x28 },
- { INTEL_FAM6_BROADWELL_G, 0x01, 0x1b },
- { INTEL_FAM6_BROADWELL_D, 0x02, 0x14 },
- { INTEL_FAM6_BROADWELL_D, 0x03, 0x07000011 },
- { INTEL_FAM6_BROADWELL_X, 0x01, 0x0b000025 },
- { INTEL_FAM6_HASWELL_L, 0x01, 0x21 },
- { INTEL_FAM6_HASWELL_G, 0x01, 0x18 },
- { INTEL_FAM6_HASWELL, 0x03, 0x23 },
- { INTEL_FAM6_HASWELL_X, 0x02, 0x3b },
- { INTEL_FAM6_HASWELL_X, 0x04, 0x10 },
- { INTEL_FAM6_IVYBRIDGE_X, 0x04, 0x42a },
+ { INTEL_KABYLAKE, 0x0B, 0x80 },
+ { INTEL_KABYLAKE, 0x0A, 0x80 },
+ { INTEL_KABYLAKE, 0x09, 0x80 },
+ { INTEL_KABYLAKE_L, 0x0A, 0x80 },
+ { INTEL_KABYLAKE_L, 0x09, 0x80 },
+ { INTEL_SKYLAKE_X, 0x03, 0x0100013e },
+ { INTEL_SKYLAKE_X, 0x04, 0x0200003c },
+ { INTEL_BROADWELL, 0x04, 0x28 },
+ { INTEL_BROADWELL_G, 0x01, 0x1b },
+ { INTEL_BROADWELL_D, 0x02, 0x14 },
+ { INTEL_BROADWELL_D, 0x03, 0x07000011 },
+ { INTEL_BROADWELL_X, 0x01, 0x0b000025 },
+ { INTEL_HASWELL_L, 0x01, 0x21 },
+ { INTEL_HASWELL_G, 0x01, 0x18 },
+ { INTEL_HASWELL, 0x03, 0x23 },
+ { INTEL_HASWELL_X, 0x02, 0x3b },
+ { INTEL_HASWELL_X, 0x04, 0x10 },
+ { INTEL_IVYBRIDGE_X, 0x04, 0x42a },
/* Observed in the wild */
- { INTEL_FAM6_SANDYBRIDGE_X, 0x06, 0x61b },
- { INTEL_FAM6_SANDYBRIDGE_X, 0x07, 0x712 },
+ { INTEL_SANDYBRIDGE_X, 0x06, 0x61b },
+ { INTEL_SANDYBRIDGE_X, 0x07, 0x712 },
};
static bool bad_spectre_microcode(struct cpuinfo_x86 *c)
@@ -173,11 +173,8 @@ static bool bad_spectre_microcode(struct cpuinfo_x86 *c)
if (cpu_has(c, X86_FEATURE_HYPERVISOR))
return false;
- if (c->x86 != 6)
- return false;
-
for (i = 0; i < ARRAY_SIZE(spectre_bad_microcodes); i++) {
- if (c->x86_model == spectre_bad_microcodes[i].model &&
+ if (c->x86_vfm == spectre_bad_microcodes[i].vfm &&
c->x86_stepping == spectre_bad_microcodes[i].stepping)
return (c->microcode <= spectre_bad_microcodes[i].microcode);
}
@@ -190,97 +187,57 @@ static bool bad_spectre_microcode(struct cpuinfo_x86 *c)
#define TME_ACTIVATE_LOCKED(x) (x & 0x1)
#define TME_ACTIVATE_ENABLED(x) (x & 0x2)
-#define TME_ACTIVATE_POLICY(x) ((x >> 4) & 0xf) /* Bits 7:4 */
-#define TME_ACTIVATE_POLICY_AES_XTS_128 0
-
#define TME_ACTIVATE_KEYID_BITS(x) ((x >> 32) & 0xf) /* Bits 35:32 */
-#define TME_ACTIVATE_CRYPTO_ALGS(x) ((x >> 48) & 0xffff) /* Bits 63:48 */
-#define TME_ACTIVATE_CRYPTO_AES_XTS_128 1
-
-/* Values for mktme_status (SW only construct) */
-#define MKTME_ENABLED 0
-#define MKTME_DISABLED 1
-#define MKTME_UNINITIALIZED 2
-static int mktme_status = MKTME_UNINITIALIZED;
-
static void detect_tme_early(struct cpuinfo_x86 *c)
{
- u64 tme_activate, tme_policy, tme_crypto_algs;
- int keyid_bits = 0, nr_keyids = 0;
- static u64 tme_activate_cpu0 = 0;
+ u64 tme_activate;
+ int keyid_bits;
rdmsrl(MSR_IA32_TME_ACTIVATE, tme_activate);
- if (mktme_status != MKTME_UNINITIALIZED) {
- if (tme_activate != tme_activate_cpu0) {
- /* Broken BIOS? */
- pr_err_once("x86/tme: configuration is inconsistent between CPUs\n");
- pr_err_once("x86/tme: MKTME is not usable\n");
- mktme_status = MKTME_DISABLED;
-
- /* Proceed. We may need to exclude bits from x86_phys_bits. */
- }
- } else {
- tme_activate_cpu0 = tme_activate;
- }
-
if (!TME_ACTIVATE_LOCKED(tme_activate) || !TME_ACTIVATE_ENABLED(tme_activate)) {
pr_info_once("x86/tme: not enabled by BIOS\n");
- mktme_status = MKTME_DISABLED;
+ clear_cpu_cap(c, X86_FEATURE_TME);
return;
}
+ pr_info_once("x86/tme: enabled by BIOS\n");
+ keyid_bits = TME_ACTIVATE_KEYID_BITS(tme_activate);
+ if (!keyid_bits)
+ return;
- if (mktme_status != MKTME_UNINITIALIZED)
- goto detect_keyid_bits;
-
- pr_info("x86/tme: enabled by BIOS\n");
-
- tme_policy = TME_ACTIVATE_POLICY(tme_activate);
- if (tme_policy != TME_ACTIVATE_POLICY_AES_XTS_128)
- pr_warn("x86/tme: Unknown policy is active: %#llx\n", tme_policy);
+ /*
+ * KeyID bits are set by BIOS and can be present regardless
+ * of whether the kernel is using them. They effectively lower
+ * the number of physical address bits.
+ *
+ * Update cpuinfo_x86::x86_phys_bits accordingly.
+ */
+ c->x86_phys_bits -= keyid_bits;
+ pr_info_once("x86/mktme: BIOS enabled: x86_phys_bits reduced by %d\n",
+ keyid_bits);
+}
- tme_crypto_algs = TME_ACTIVATE_CRYPTO_ALGS(tme_activate);
- if (!(tme_crypto_algs & TME_ACTIVATE_CRYPTO_AES_XTS_128)) {
- pr_err("x86/mktme: No known encryption algorithm is supported: %#llx\n",
- tme_crypto_algs);
- mktme_status = MKTME_DISABLED;
- }
-detect_keyid_bits:
- keyid_bits = TME_ACTIVATE_KEYID_BITS(tme_activate);
- nr_keyids = (1UL << keyid_bits) - 1;
- if (nr_keyids) {
- pr_info_once("x86/mktme: enabled by BIOS\n");
- pr_info_once("x86/mktme: %d KeyIDs available\n", nr_keyids);
- } else {
- pr_info_once("x86/mktme: disabled by BIOS\n");
- }
+void intel_unlock_cpuid_leafs(struct cpuinfo_x86 *c)
+{
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return;
- if (mktme_status == MKTME_UNINITIALIZED) {
- /* MKTME is usable */
- mktme_status = MKTME_ENABLED;
- }
+ if (c->x86 < 6 || (c->x86 == 6 && c->x86_model < 0xd))
+ return;
/*
- * KeyID bits effectively lower the number of physical address
- * bits. Update cpuinfo_x86::x86_phys_bits accordingly.
+ * The BIOS can have limited CPUID to leaf 2, which breaks feature
+ * enumeration. Unlock it and update the maximum leaf info.
*/
- c->x86_phys_bits -= keyid_bits;
+ if (msr_clear_bit(MSR_IA32_MISC_ENABLE, MSR_IA32_MISC_ENABLE_LIMIT_CPUID_BIT) > 0)
+ c->cpuid_level = cpuid_eax(0);
}
static void early_init_intel(struct cpuinfo_x86 *c)
{
u64 misc_enable;
- /* Unmask CPUID levels if masked: */
- if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) {
- if (msr_clear_bit(MSR_IA32_MISC_ENABLE,
- MSR_IA32_MISC_ENABLE_LIMIT_CPUID_BIT) > 0) {
- c->cpuid_level = cpuid_eax(0);
- get_cpu_cap(c);
- }
- }
-
if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
(c->x86 == 0x6 && c->x86_model >= 0x0e))
set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
@@ -312,7 +269,7 @@ static void early_init_intel(struct cpuinfo_x86 *c)
* need the microcode to have already been loaded... so if it is
* not, recommend a BIOS update and disable large pages.
*/
- if (c->x86 == 6 && c->x86_model == 0x1c && c->x86_stepping <= 2 &&
+ if (c->x86_vfm == INTEL_ATOM_BONNELL && c->x86_stepping <= 2 &&
c->microcode < 0x20e) {
pr_warn("Atom PSE erratum detected, BIOS microcode update recommended\n");
clear_cpu_cap(c, X86_FEATURE_PSE);
@@ -344,30 +301,28 @@ static void early_init_intel(struct cpuinfo_x86 *c)
}
/* Penwell and Cloverview have the TSC which doesn't sleep on S3 */
- if (c->x86 == 6) {
- switch (c->x86_model) {
- case INTEL_FAM6_ATOM_SALTWELL_MID:
- case INTEL_FAM6_ATOM_SALTWELL_TABLET:
- case INTEL_FAM6_ATOM_SILVERMONT_MID:
- case INTEL_FAM6_ATOM_AIRMONT_NP:
- set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC_S3);
- break;
- default:
- break;
- }
+ switch (c->x86_vfm) {
+ case INTEL_ATOM_SALTWELL_MID:
+ case INTEL_ATOM_SALTWELL_TABLET:
+ case INTEL_ATOM_SILVERMONT_MID:
+ case INTEL_ATOM_AIRMONT_NP:
+ set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC_S3);
+ break;
}
/*
- * There is a known erratum on Pentium III and Core Solo
- * and Core Duo CPUs.
- * " Page with PAT set to WC while associated MTRR is UC
- * may consolidate to UC "
- * Because of this erratum, it is better to stick with
- * setting WC in MTRR rather than using PAT on these CPUs.
+ * PAT is broken on early family 6 CPUs, the last of which
+ * is "Yonah" where the erratum is named "AN7":
+ *
+ * Page with PAT (Page Attribute Table) Set to USWC
+ * (Uncacheable Speculative Write Combine) While
+ * Associated MTRR (Memory Type Range Register) Is UC
+ * (Uncacheable) May Consolidate to UC
*
- * Enable PAT WC only on P4, Core 2 or later CPUs.
+ * Disable PAT and fall back to MTRR on these CPUs.
*/
- if (c->x86 == 6 && c->x86_model < 15)
+ if (c->x86_vfm >= INTEL_PENTIUM_PRO &&
+ c->x86_vfm <= INTEL_CORE_YONAH)
clear_cpu_cap(c, X86_FEATURE_PAT);
/*
@@ -393,7 +348,7 @@ static void early_init_intel(struct cpuinfo_x86 *c)
* should be false so that __flush_tlb_all() causes CR3 instead of CR4.PGE
* to be modified.
*/
- if (c->x86 == 5 && c->x86_model == 9) {
+ if (c->x86_vfm == INTEL_QUARK_X1000) {
pr_info("Disabling PGE capability bit\n");
setup_clear_cpu_cap(X86_FEATURE_PGE);
}
@@ -625,12 +580,13 @@ static void init_intel(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_PEBS);
}
- if (c->x86 == 6 && boot_cpu_has(X86_FEATURE_CLFLUSH) &&
- (c->x86_model == 29 || c->x86_model == 46 || c->x86_model == 47))
+ if (boot_cpu_has(X86_FEATURE_CLFLUSH) &&
+ (c->x86_vfm == INTEL_CORE2_DUNNINGTON ||
+ c->x86_vfm == INTEL_NEHALEM_EX ||
+ c->x86_vfm == INTEL_WESTMERE_EX))
set_cpu_bug(c, X86_BUG_CLFLUSH_MONITOR);
- if (c->x86 == 6 && boot_cpu_has(X86_FEATURE_MWAIT) &&
- ((c->x86_model == INTEL_FAM6_ATOM_GOLDMONT)))
+ if (boot_cpu_has(X86_FEATURE_MWAIT) && c->x86_vfm == INTEL_ATOM_GOLDMONT)
set_cpu_bug(c, X86_BUG_MONITOR);
#ifdef CONFIG_X86_64
@@ -1246,9 +1202,9 @@ void handle_bus_lock(struct pt_regs *regs)
* feature even though they do not enumerate IA32_CORE_CAPABILITIES.
*/
static const struct x86_cpu_id split_lock_cpu_ids[] __initconst = {
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, 0),
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, 0),
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, 0),
+ X86_MATCH_VFM(INTEL_ICELAKE_X, 0),
+ X86_MATCH_VFM(INTEL_ICELAKE_L, 0),
+ X86_MATCH_VFM(INTEL_ICELAKE_D, 0),
{}
};
diff --git a/arch/x86/kernel/cpu/intel_epb.c b/arch/x86/kernel/cpu/intel_epb.c
index f18d35fe27a9..30b1d63b97f3 100644
--- a/arch/x86/kernel/cpu/intel_epb.c
+++ b/arch/x86/kernel/cpu/intel_epb.c
@@ -204,12 +204,12 @@ static int intel_epb_offline(unsigned int cpu)
}
static const struct x86_cpu_id intel_epb_normal[] = {
- X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L,
- ENERGY_PERF_BIAS_NORMAL_POWERSAVE),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_GRACEMONT,
- ENERGY_PERF_BIAS_NORMAL_POWERSAVE),
- X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P,
- ENERGY_PERF_BIAS_NORMAL_POWERSAVE),
+ X86_MATCH_VFM(INTEL_ALDERLAKE_L,
+ ENERGY_PERF_BIAS_NORMAL_POWERSAVE),
+ X86_MATCH_VFM(INTEL_ATOM_GRACEMONT,
+ ENERGY_PERF_BIAS_NORMAL_POWERSAVE),
+ X86_MATCH_VFM(INTEL_RAPTORLAKE_P,
+ ENERGY_PERF_BIAS_NORMAL_POWERSAVE),
{}
};
diff --git a/arch/x86/kernel/cpu/intel_pconfig.c b/arch/x86/kernel/cpu/intel_pconfig.c
deleted file mode 100644
index 5be2b1790282..000000000000
--- a/arch/x86/kernel/cpu/intel_pconfig.c
+++ /dev/null
@@ -1,84 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Intel PCONFIG instruction support.
- *
- * Copyright (C) 2017 Intel Corporation
- *
- * Author:
- * Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
- */
-#include <linux/bug.h>
-#include <linux/limits.h>
-
-#include <asm/cpufeature.h>
-#include <asm/intel_pconfig.h>
-
-#define PCONFIG_CPUID 0x1b
-
-#define PCONFIG_CPUID_SUBLEAF_MASK ((1 << 12) - 1)
-
-/* Subleaf type (EAX) for PCONFIG CPUID leaf (0x1B) */
-enum {
- PCONFIG_CPUID_SUBLEAF_INVALID = 0,
- PCONFIG_CPUID_SUBLEAF_TARGETID = 1,
-};
-
-/* Bitmask of supported targets */
-static u64 targets_supported __read_mostly;
-
-int pconfig_target_supported(enum pconfig_target target)
-{
- /*
- * We would need to re-think the implementation once we get > 64
- * PCONFIG targets. Spec allows up to 2^32 targets.
- */
- BUILD_BUG_ON(PCONFIG_TARGET_NR >= 64);
-
- if (WARN_ON_ONCE(target >= 64))
- return 0;
- return targets_supported & (1ULL << target);
-}
-
-static int __init intel_pconfig_init(void)
-{
- int subleaf;
-
- if (!boot_cpu_has(X86_FEATURE_PCONFIG))
- return 0;
-
- /*
- * Scan subleafs of PCONFIG CPUID leaf.
- *
- * Subleafs of the same type need not to be consecutive.
- *
- * Stop on the first invalid subleaf type. All subleafs after the first
- * invalid are invalid too.
- */
- for (subleaf = 0; subleaf < INT_MAX; subleaf++) {
- struct cpuid_regs regs;
-
- cpuid_count(PCONFIG_CPUID, subleaf,
- &regs.eax, &regs.ebx, &regs.ecx, &regs.edx);
-
- switch (regs.eax & PCONFIG_CPUID_SUBLEAF_MASK) {
- case PCONFIG_CPUID_SUBLEAF_INVALID:
- /* Stop on the first invalid subleaf */
- goto out;
- case PCONFIG_CPUID_SUBLEAF_TARGETID:
- /* Mark supported PCONFIG targets */
- if (regs.ebx < 64)
- targets_supported |= (1ULL << regs.ebx);
- if (regs.ecx < 64)
- targets_supported |= (1ULL << regs.ecx);
- if (regs.edx < 64)
- targets_supported |= (1ULL << regs.edx);
- break;
- default:
- /* Unknown CPUID.PCONFIG subleaf: ignore */
- break;
- }
- }
-out:
- return 0;
-}
-arch_initcall(intel_pconfig_init);
diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c
index ad6776081e60..8e7de733320a 100644
--- a/arch/x86/kernel/cpu/match.c
+++ b/arch/x86/kernel/cpu/match.c
@@ -17,8 +17,7 @@
*
* A typical table entry would be to match a specific CPU
*
- * X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, INTEL_FAM6_BROADWELL,
- * X86_FEATURE_ANY, NULL);
+ * X86_MATCH_VFM_FEATURE(INTEL_BROADWELL, X86_FEATURE_ANY, NULL);
*
* Fields can be wildcarded with %X86_VENDOR_ANY, %X86_FAMILY_ANY,
* %X86_MODEL_ANY, %X86_FEATURE_ANY (except for vendor)
@@ -26,7 +25,7 @@
* asm/cpu_device_id.h contains a set of useful macros which are shortcuts
* for various common selections. The above can be shortened to:
*
- * X86_MATCH_INTEL_FAM6_MODEL(BROADWELL, NULL);
+ * X86_MATCH_VFM(INTEL_BROADWELL, NULL);
*
* Arrays used to match for this should also be declared using
* MODULE_DEVICE_TABLE(x86cpu, ...)
@@ -39,9 +38,7 @@ const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match)
const struct x86_cpu_id *m;
struct cpuinfo_x86 *c = &boot_cpu_data;
- for (m = match;
- m->vendor | m->family | m->model | m->steppings | m->feature;
- m++) {
+ for (m = match; m->flags & X86_CPU_ID_FLAG_ENTRY_VALID; m++) {
if (m->vendor != X86_VENDOR_ANY && c->x86_vendor != m->vendor)
continue;
if (m->family != X86_FAMILY_ANY && c->x86 != m->family)
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index 9a0133ef7e20..14bf8c232e45 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -780,7 +780,7 @@ static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc)
{
struct mce m;
- mce_setup(&m);
+ mce_prep_record(&m);
m.status = status;
m.misc = misc;
diff --git a/arch/x86/kernel/cpu/mce/apei.c b/arch/x86/kernel/cpu/mce/apei.c
index 7f7309ff67d0..3885fe05f01e 100644
--- a/arch/x86/kernel/cpu/mce/apei.c
+++ b/arch/x86/kernel/cpu/mce/apei.c
@@ -44,7 +44,7 @@ void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err)
else
lsb = PAGE_SHIFT;
- mce_setup(&m);
+ mce_prep_record(&m);
m.bank = -1;
/* Fake a memory read error with unknown channel */
m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | MCI_STATUS_MISCV | 0x9f;
@@ -66,6 +66,7 @@ EXPORT_SYMBOL_GPL(apei_mce_report_mem_error);
int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id)
{
const u64 *i_mce = ((const u64 *) (ctx_info + 1));
+ bool apicid_found = false;
unsigned int cpu;
struct mce m;
@@ -97,20 +98,19 @@ int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id)
if (ctx_info->reg_arr_size < 48)
return -EINVAL;
- mce_setup(&m);
-
- m.extcpu = -1;
- m.socketid = -1;
-
for_each_possible_cpu(cpu) {
if (cpu_data(cpu).topo.initial_apicid == lapic_id) {
- m.extcpu = cpu;
- m.socketid = cpu_data(m.extcpu).topo.pkg_id;
+ apicid_found = true;
break;
}
}
- m.apicid = lapic_id;
+ if (!apicid_found)
+ return -EINVAL;
+
+ mce_prep_record_common(&m);
+ mce_prep_record_per_cpu(cpu, &m);
+
m.bank = (ctx_info->msr_addr >> 4) & 0xFF;
m.status = *i_mce;
m.addr = *(i_mce + 1);
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 84d41be6d06b..2a938f429c4d 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -47,7 +47,7 @@
#include <linux/kexec.h>
#include <asm/fred.h>
-#include <asm/intel-family.h>
+#include <asm/cpu_device_id.h>
#include <asm/processor.h>
#include <asm/traps.h>
#include <asm/tlbflush.h>
@@ -117,20 +117,32 @@ static struct irq_work mce_irq_work;
*/
BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain);
-/* Do initial initialization of a struct mce */
-void mce_setup(struct mce *m)
+void mce_prep_record_common(struct mce *m)
{
memset(m, 0, sizeof(struct mce));
- m->cpu = m->extcpu = smp_processor_id();
+
+ m->cpuid = cpuid_eax(1);
+ m->cpuvendor = boot_cpu_data.x86_vendor;
+ m->mcgcap = __rdmsr(MSR_IA32_MCG_CAP);
/* need the internal __ version to avoid deadlocks */
- m->time = __ktime_get_real_seconds();
- m->cpuvendor = boot_cpu_data.x86_vendor;
- m->cpuid = cpuid_eax(1);
- m->socketid = cpu_data(m->extcpu).topo.pkg_id;
- m->apicid = cpu_data(m->extcpu).topo.initial_apicid;
- m->mcgcap = __rdmsr(MSR_IA32_MCG_CAP);
- m->ppin = cpu_data(m->extcpu).ppin;
- m->microcode = boot_cpu_data.microcode;
+ m->time = __ktime_get_real_seconds();
+}
+
+void mce_prep_record_per_cpu(unsigned int cpu, struct mce *m)
+{
+ m->cpu = cpu;
+ m->extcpu = cpu;
+ m->apicid = cpu_data(cpu).topo.initial_apicid;
+ m->microcode = cpu_data(cpu).microcode;
+ m->ppin = topology_ppin(cpu);
+ m->socketid = topology_physical_package_id(cpu);
+}
+
+/* Do initial initialization of a struct mce */
+void mce_prep_record(struct mce *m)
+{
+ mce_prep_record_common(m);
+ mce_prep_record_per_cpu(smp_processor_id(), m);
}
DEFINE_PER_CPU(struct mce, injectm);
@@ -436,11 +448,11 @@ static noinstr void mce_wrmsrl(u32 msr, u64 v)
static noinstr void mce_gather_info(struct mce *m, struct pt_regs *regs)
{
/*
- * Enable instrumentation around mce_setup() which calls external
+ * Enable instrumentation around mce_prep_record() which calls external
* facilities.
*/
instrumentation_begin();
- mce_setup(m);
+ mce_prep_record(m);
instrumentation_end();
m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
@@ -677,10 +689,9 @@ DEFINE_PER_CPU(unsigned, mce_poll_count);
* is already totally * confused. In this case it's likely it will
* not fully execute the machine check handler either.
*/
-bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
+void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{
struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
- bool error_seen = false;
struct mce m;
int i;
@@ -754,8 +765,6 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
continue;
log_it:
- error_seen = true;
-
if (flags & MCP_DONTLOG)
goto clear_it;
@@ -787,8 +796,6 @@ clear_it:
*/
sync_core();
-
- return error_seen;
}
EXPORT_SYMBOL_GPL(machine_check_poll);
@@ -1593,6 +1600,24 @@ noinstr void do_machine_check(struct pt_regs *regs)
else
queue_task_work(&m, msg, kill_me_maybe);
+ } else if (m.mcgstatus & MCG_STATUS_SEAM_NR) {
+ /*
+ * Saved RIP on stack makes it look like the machine check
+ * was taken in the kernel on the instruction following
+ * the entry to SEAM mode. But MCG_STATUS_SEAM_NR indicates
+ * that the machine check was taken inside SEAM non-root
+ * mode. CPU core has already marked that guest as dead.
+ * It is OK for the kernel to resume execution at the
+ * apparent point of the machine check as the fault did
+ * not occur there. Mark the page as poisoned so it won't
+ * be added to free list when the guest is terminated.
+ */
+ if (mce_usable_address(&m)) {
+ struct page *p = pfn_to_online_page(m.addr >> PAGE_SHIFT);
+
+ if (p)
+ SetPageHWPoison(p);
+ }
} else {
/*
* Handle an MCE which has happened in kernel space but from
@@ -1930,14 +1955,14 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0)
cfg->bootlog = 0;
- if (c->x86 == 6 && c->x86_model == 45)
+ if (c->x86_vfm == INTEL_SANDYBRIDGE_X)
mce_flags.snb_ifu_quirk = 1;
/*
* Skylake, Cascacde Lake and Cooper Lake require a quirk on
* rep movs.
*/
- if (c->x86 == 6 && c->x86_model == INTEL_FAM6_SKYLAKE_X)
+ if (c->x86_vfm == INTEL_SKYLAKE_X)
mce_flags.skx_repmov_quirk = 1;
}
diff --git a/arch/x86/kernel/cpu/mce/dev-mcelog.c b/arch/x86/kernel/cpu/mce/dev-mcelog.c
index a05ac0716ecf..af44fd5dbd7c 100644
--- a/arch/x86/kernel/cpu/mce/dev-mcelog.c
+++ b/arch/x86/kernel/cpu/mce/dev-mcelog.c
@@ -314,7 +314,7 @@ static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
/*
* Need to give user space some time to set everything up,
- * so do it a jiffie or two later everywhere.
+ * so do it a jiffy or two later everywhere.
*/
schedule_timeout(2);
@@ -331,7 +331,6 @@ static const struct file_operations mce_chrdev_ops = {
.poll = mce_chrdev_poll,
.unlocked_ioctl = mce_chrdev_ioctl,
.compat_ioctl = compat_ptr_ioctl,
- .llseek = no_llseek,
};
static struct miscdevice mce_chrdev_device = {
diff --git a/arch/x86/kernel/cpu/mce/genpool.c b/arch/x86/kernel/cpu/mce/genpool.c
index fbe8b61c3413..4284749ec803 100644
--- a/arch/x86/kernel/cpu/mce/genpool.c
+++ b/arch/x86/kernel/cpu/mce/genpool.c
@@ -16,14 +16,14 @@
* used to save error information organized in a lock-less list.
*
* This memory pool is only to be used to save MCE records in MCE context.
- * MCE events are rare, so a fixed size memory pool should be enough. Use
- * 2 pages to save MCE events for now (~80 MCE records at most).
+ * MCE events are rare, so a fixed size memory pool should be enough.
+ * Allocate on a sliding scale based on number of CPUs.
*/
-#define MCE_POOLSZ (2 * PAGE_SIZE)
+#define MCE_MIN_ENTRIES 80
+#define MCE_PER_CPU 2
static struct gen_pool *mce_evt_pool;
static LLIST_HEAD(mce_event_llist);
-static char gen_pool_buf[MCE_POOLSZ];
/*
* Compare the record "t" with each of the records on list "l" to see if
@@ -118,22 +118,32 @@ int mce_gen_pool_add(struct mce *mce)
static int mce_gen_pool_create(void)
{
- struct gen_pool *tmpp;
+ int mce_numrecords, mce_poolsz, order;
+ struct gen_pool *gpool;
int ret = -ENOMEM;
-
- tmpp = gen_pool_create(ilog2(sizeof(struct mce_evt_llist)), -1);
- if (!tmpp)
- goto out;
-
- ret = gen_pool_add(tmpp, (unsigned long)gen_pool_buf, MCE_POOLSZ, -1);
+ void *mce_pool;
+
+ order = order_base_2(sizeof(struct mce_evt_llist));
+ gpool = gen_pool_create(order, -1);
+ if (!gpool)
+ return ret;
+
+ mce_numrecords = max(MCE_MIN_ENTRIES, num_possible_cpus() * MCE_PER_CPU);
+ mce_poolsz = mce_numrecords * (1 << order);
+ mce_pool = kmalloc(mce_poolsz, GFP_KERNEL);
+ if (!mce_pool) {
+ gen_pool_destroy(gpool);
+ return ret;
+ }
+ ret = gen_pool_add(gpool, (unsigned long)mce_pool, mce_poolsz, -1);
if (ret) {
- gen_pool_destroy(tmpp);
- goto out;
+ gen_pool_destroy(gpool);
+ kfree(mce_pool);
+ return ret;
}
- mce_evt_pool = tmpp;
+ mce_evt_pool = gpool;
-out:
return ret;
}
diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c
index 94953d749475..49ed3428785d 100644
--- a/arch/x86/kernel/cpu/mce/inject.c
+++ b/arch/x86/kernel/cpu/mce/inject.c
@@ -487,12 +487,16 @@ static void prepare_msrs(void *info)
wrmsrl(MSR_AMD64_SMCA_MCx_ADDR(b), m.addr);
}
- wrmsrl(MSR_AMD64_SMCA_MCx_MISC(b), m.misc);
wrmsrl(MSR_AMD64_SMCA_MCx_SYND(b), m.synd);
+
+ if (m.misc)
+ wrmsrl(MSR_AMD64_SMCA_MCx_MISC(b), m.misc);
} else {
wrmsrl(MSR_IA32_MCx_STATUS(b), m.status);
wrmsrl(MSR_IA32_MCx_ADDR(b), m.addr);
- wrmsrl(MSR_IA32_MCx_MISC(b), m.misc);
+
+ if (m.misc)
+ wrmsrl(MSR_IA32_MCx_MISC(b), m.misc);
}
}
@@ -795,4 +799,5 @@ static void __exit inject_exit(void)
module_init(inject_init);
module_exit(inject_exit);
+MODULE_DESCRIPTION("Machine check injection support");
MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c
index 399b62e223d2..f6103e6bf69a 100644
--- a/arch/x86/kernel/cpu/mce/intel.c
+++ b/arch/x86/kernel/cpu/mce/intel.c
@@ -13,7 +13,7 @@
#include <linux/cpumask.h>
#include <asm/apic.h>
#include <asm/cpufeature.h>
-#include <asm/intel-family.h>
+#include <asm/cpu_device_id.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/mce.h>
@@ -455,10 +455,10 @@ static void intel_imc_init(struct cpuinfo_x86 *c)
{
u64 error_control;
- switch (c->x86_model) {
- case INTEL_FAM6_SANDYBRIDGE_X:
- case INTEL_FAM6_IVYBRIDGE_X:
- case INTEL_FAM6_HASWELL_X:
+ switch (c->x86_vfm) {
+ case INTEL_SANDYBRIDGE_X:
+ case INTEL_IVYBRIDGE_X:
+ case INTEL_HASWELL_X:
if (rdmsrl_safe(MSR_ERROR_CONTROL, &error_control))
return;
error_control |= 2;
@@ -484,12 +484,11 @@ bool intel_filter_mce(struct mce *m)
struct cpuinfo_x86 *c = &boot_cpu_data;
/* MCE errata HSD131, HSM142, HSW131, BDM48, HSM142 and SKX37 */
- if ((c->x86 == 6) &&
- ((c->x86_model == INTEL_FAM6_HASWELL) ||
- (c->x86_model == INTEL_FAM6_HASWELL_L) ||
- (c->x86_model == INTEL_FAM6_BROADWELL) ||
- (c->x86_model == INTEL_FAM6_HASWELL_G) ||
- (c->x86_model == INTEL_FAM6_SKYLAKE_X)) &&
+ if ((c->x86_vfm == INTEL_HASWELL ||
+ c->x86_vfm == INTEL_HASWELL_L ||
+ c->x86_vfm == INTEL_BROADWELL ||
+ c->x86_vfm == INTEL_HASWELL_G ||
+ c->x86_vfm == INTEL_SKYLAKE_X) &&
(m->bank == 0) &&
((m->status & 0xa0000000ffffffff) == 0x80000000000f0005))
return true;
diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h
index 01f8f03969e6..43c7f3b71df5 100644
--- a/arch/x86/kernel/cpu/mce/internal.h
+++ b/arch/x86/kernel/cpu/mce/internal.h
@@ -261,6 +261,8 @@ enum mca_msr {
/* Decide whether to add MCE record to MCE event pool or filter it out. */
extern bool filter_mce(struct mce *m);
+void mce_prep_record_common(struct mce *m);
+void mce_prep_record_per_cpu(unsigned int cpu, struct mce *m);
#ifdef CONFIG_X86_MCE_AMD
extern bool amd_filter_mce(struct mce *m);
diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c
index c4477162c07d..dac4d64dfb2a 100644
--- a/arch/x86/kernel/cpu/mce/severity.c
+++ b/arch/x86/kernel/cpu/mce/severity.c
@@ -12,7 +12,7 @@
#include <linux/uaccess.h>
#include <asm/mce.h>
-#include <asm/intel-family.h>
+#include <asm/cpu_device_id.h>
#include <asm/traps.h>
#include <asm/insn.h>
#include <asm/insn-eval.h>
@@ -39,20 +39,20 @@ static struct severity {
u64 mask;
u64 result;
unsigned char sev;
- unsigned char mcgmask;
- unsigned char mcgres;
+ unsigned short mcgmask;
+ unsigned short mcgres;
unsigned char ser;
unsigned char context;
unsigned char excp;
unsigned char covered;
- unsigned char cpu_model;
+ unsigned int cpu_vfm;
unsigned char cpu_minstepping;
unsigned char bank_lo, bank_hi;
char *msg;
} severities[] = {
#define MCESEV(s, m, c...) { .sev = MCE_ ## s ## _SEVERITY, .msg = m, ## c }
#define BANK_RANGE(l, h) .bank_lo = l, .bank_hi = h
-#define MODEL_STEPPING(m, s) .cpu_model = m, .cpu_minstepping = s
+#define VFM_STEPPING(m, s) .cpu_vfm = m, .cpu_minstepping = s
#define KERNEL .context = IN_KERNEL
#define USER .context = IN_USER
#define KERNEL_RECOV .context = IN_KERNEL_RECOV
@@ -128,7 +128,7 @@ static struct severity {
MCESEV(
AO, "Uncorrected Patrol Scrub Error",
SER, MASK(MCI_STATUS_UC|MCI_ADDR|0xffffeff0, MCI_ADDR|0x001000c0),
- MODEL_STEPPING(INTEL_FAM6_SKYLAKE_X, 4), BANK_RANGE(13, 18)
+ VFM_STEPPING(INTEL_SKYLAKE_X, 4), BANK_RANGE(13, 18)
),
/* ignore OVER for UCNA */
@@ -174,6 +174,18 @@ static struct severity {
USER
),
MCESEV(
+ AR, "Data load error in SEAM non-root mode",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
+ MCGMASK(MCG_STATUS_SEAM_NR, MCG_STATUS_SEAM_NR),
+ KERNEL
+ ),
+ MCESEV(
+ AR, "Instruction fetch error in SEAM non-root mode",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR),
+ MCGMASK(MCG_STATUS_SEAM_NR, MCG_STATUS_SEAM_NR),
+ KERNEL
+ ),
+ MCESEV(
PANIC, "Data load in unrecoverable area of kernel",
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
KERNEL
@@ -290,7 +302,6 @@ static noinstr int error_context(struct mce *m, struct pt_regs *regs)
switch (fixup_type) {
case EX_TYPE_UACCESS:
- case EX_TYPE_COPY:
if (!copy_user)
return IN_KERNEL;
m->kflags |= MCE_IN_KERNEL_COPYIN;
@@ -386,7 +397,7 @@ static noinstr int mce_severity_intel(struct mce *m, struct pt_regs *regs, char
continue;
if (s->excp && excp != s->excp)
continue;
- if (s->cpu_model && boot_cpu_data.x86_model != s->cpu_model)
+ if (s->cpu_vfm && boot_cpu_data.x86_vfm != s->cpu_vfm)
continue;
if (s->cpu_minstepping && boot_cpu_data.x86_stepping < s->cpu_minstepping)
continue;
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index 13b45b9c806d..f63b051f25a0 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -84,13 +84,36 @@ struct microcode_amd {
unsigned int mpb[];
};
-#define PATCH_MAX_SIZE (3 * PAGE_SIZE)
-
static struct equiv_cpu_table {
unsigned int num_entries;
struct equiv_cpu_entry *entry;
} equiv_table;
+union zen_patch_rev {
+ struct {
+ __u32 rev : 8,
+ stepping : 4,
+ model : 4,
+ __reserved : 4,
+ ext_model : 4,
+ ext_fam : 8;
+ };
+ __u32 ucode_rev;
+};
+
+union cpuid_1_eax {
+ struct {
+ __u32 stepping : 4,
+ model : 4,
+ family : 4,
+ __reserved0 : 4,
+ ext_model : 4,
+ ext_fam : 8,
+ __reserved1 : 4;
+ };
+ __u32 full;
+};
+
/*
* This points to the current valid container of microcode patches which we will
* save from the initrd/builtin before jettisoning its contents. @mc is the
@@ -98,7 +121,6 @@ static struct equiv_cpu_table {
*/
struct cont_desc {
struct microcode_amd *mc;
- u32 cpuid_1_eax;
u32 psize;
u8 *data;
size_t size;
@@ -111,10 +133,42 @@ struct cont_desc {
static const char
ucode_path[] __maybe_unused = "kernel/x86/microcode/AuthenticAMD.bin";
+/*
+ * This is CPUID(1).EAX on the BSP. It is used in two ways:
+ *
+ * 1. To ignore the equivalence table on Zen1 and newer.
+ *
+ * 2. To match which patches to load because the patch revision ID
+ * already contains the f/m/s for which the microcode is destined
+ * for.
+ */
+static u32 bsp_cpuid_1_eax __ro_after_init;
+
+static union cpuid_1_eax ucode_rev_to_cpuid(unsigned int val)
+{
+ union zen_patch_rev p;
+ union cpuid_1_eax c;
+
+ p.ucode_rev = val;
+ c.full = 0;
+
+ c.stepping = p.stepping;
+ c.model = p.model;
+ c.ext_model = p.ext_model;
+ c.family = 0xf;
+ c.ext_fam = p.ext_fam;
+
+ return c;
+}
+
static u16 find_equiv_id(struct equiv_cpu_table *et, u32 sig)
{
unsigned int i;
+ /* Zen and newer do not need an equivalence table. */
+ if (x86_family(bsp_cpuid_1_eax) >= 0x17)
+ return 0;
+
if (!et || !et->num_entries)
return 0;
@@ -161,6 +215,10 @@ static bool verify_equivalence_table(const u8 *buf, size_t buf_size)
if (!verify_container(buf, buf_size))
return false;
+ /* Zen and newer do not need an equivalence table. */
+ if (x86_family(bsp_cpuid_1_eax) >= 0x17)
+ return true;
+
cont_type = hdr[1];
if (cont_type != UCODE_EQUIV_CPU_TABLE_TYPE) {
pr_debug("Wrong microcode container equivalence table type: %u.\n",
@@ -224,8 +282,9 @@ __verify_patch_section(const u8 *buf, size_t buf_size, u32 *sh_psize)
* exceed the per-family maximum). @sh_psize is the size read from the section
* header.
*/
-static unsigned int __verify_patch_size(u8 family, u32 sh_psize, size_t buf_size)
+static unsigned int __verify_patch_size(u32 sh_psize, size_t buf_size)
{
+ u8 family = x86_family(bsp_cpuid_1_eax);
u32 max_size;
if (family >= 0x15)
@@ -260,9 +319,9 @@ static unsigned int __verify_patch_size(u8 family, u32 sh_psize, size_t buf_size
* positive: patch is not for this family, skip it
* 0: success
*/
-static int
-verify_patch(u8 family, const u8 *buf, size_t buf_size, u32 *patch_size)
+static int verify_patch(const u8 *buf, size_t buf_size, u32 *patch_size)
{
+ u8 family = x86_family(bsp_cpuid_1_eax);
struct microcode_header_amd *mc_hdr;
unsigned int ret;
u32 sh_psize;
@@ -288,7 +347,7 @@ verify_patch(u8 family, const u8 *buf, size_t buf_size, u32 *patch_size)
return -1;
}
- ret = __verify_patch_size(family, sh_psize, buf_size);
+ ret = __verify_patch_size(sh_psize, buf_size);
if (!ret) {
pr_debug("Per-family patch size mismatch.\n");
return -1;
@@ -310,6 +369,15 @@ verify_patch(u8 family, const u8 *buf, size_t buf_size, u32 *patch_size)
return 0;
}
+static bool mc_patch_matches(struct microcode_amd *mc, u16 eq_id)
+{
+ /* Zen and newer do not need an equivalence table. */
+ if (x86_family(bsp_cpuid_1_eax) >= 0x17)
+ return ucode_rev_to_cpuid(mc->hdr.patch_id).full == bsp_cpuid_1_eax;
+ else
+ return eq_id == mc->hdr.processor_rev_id;
+}
+
/*
* This scans the ucode blob for the proper container as we can have multiple
* containers glued together. Returns the equivalence ID from the equivalence
@@ -338,7 +406,7 @@ static size_t parse_container(u8 *ucode, size_t size, struct cont_desc *desc)
* doesn't contain a patch for the CPU, scan through the whole container
* so that it can be skipped in case there are other containers appended.
*/
- eq_id = find_equiv_id(&table, desc->cpuid_1_eax);
+ eq_id = find_equiv_id(&table, bsp_cpuid_1_eax);
buf += hdr[2] + CONTAINER_HDR_SZ;
size -= hdr[2] + CONTAINER_HDR_SZ;
@@ -352,7 +420,7 @@ static size_t parse_container(u8 *ucode, size_t size, struct cont_desc *desc)
u32 patch_size;
int ret;
- ret = verify_patch(x86_family(desc->cpuid_1_eax), buf, size, &patch_size);
+ ret = verify_patch(buf, size, &patch_size);
if (ret < 0) {
/*
* Patch verification failed, skip to the next container, if
@@ -365,7 +433,7 @@ static size_t parse_container(u8 *ucode, size_t size, struct cont_desc *desc)
}
mc = (struct microcode_amd *)(buf + SECTION_HDR_SIZE);
- if (eq_id == mc->hdr.processor_rev_id) {
+ if (mc_patch_matches(mc, eq_id)) {
desc->psize = patch_size;
desc->mc = mc;
}
@@ -423,6 +491,7 @@ static int __apply_microcode_amd(struct microcode_amd *mc)
/* verify patch application was successful */
native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
+
if (rev != mc->hdr.patch_id)
return -1;
@@ -440,14 +509,12 @@ static int __apply_microcode_amd(struct microcode_amd *mc)
*
* Returns true if container found (sets @desc), false otherwise.
*/
-static bool early_apply_microcode(u32 cpuid_1_eax, u32 old_rev, void *ucode, size_t size)
+static bool early_apply_microcode(u32 old_rev, void *ucode, size_t size)
{
struct cont_desc desc = { 0 };
struct microcode_amd *mc;
bool ret = false;
- desc.cpuid_1_eax = cpuid_1_eax;
-
scan_containers(ucode, size, &desc);
mc = desc.mc;
@@ -465,9 +532,10 @@ static bool early_apply_microcode(u32 cpuid_1_eax, u32 old_rev, void *ucode, siz
return !__apply_microcode_amd(mc);
}
-static bool get_builtin_microcode(struct cpio_data *cp, unsigned int family)
+static bool get_builtin_microcode(struct cpio_data *cp)
{
char fw_name[36] = "amd-ucode/microcode_amd.bin";
+ u8 family = x86_family(bsp_cpuid_1_eax);
struct firmware fw;
if (IS_ENABLED(CONFIG_X86_32))
@@ -486,11 +554,11 @@ static bool get_builtin_microcode(struct cpio_data *cp, unsigned int family)
return false;
}
-static void __init find_blobs_in_containers(unsigned int cpuid_1_eax, struct cpio_data *ret)
+static void __init find_blobs_in_containers(struct cpio_data *ret)
{
struct cpio_data cp;
- if (!get_builtin_microcode(&cp, x86_family(cpuid_1_eax)))
+ if (!get_builtin_microcode(&cp))
cp = find_microcode_in_initrd(ucode_path);
*ret = cp;
@@ -501,16 +569,18 @@ void __init load_ucode_amd_bsp(struct early_load_data *ed, unsigned int cpuid_1_
struct cpio_data cp = { };
u32 dummy;
+ bsp_cpuid_1_eax = cpuid_1_eax;
+
native_rdmsr(MSR_AMD64_PATCH_LEVEL, ed->old_rev, dummy);
/* Needed in load_microcode_amd() */
ucode_cpu_info[0].cpu_sig.sig = cpuid_1_eax;
- find_blobs_in_containers(cpuid_1_eax, &cp);
+ find_blobs_in_containers(&cp);
if (!(cp.data && cp.size))
return;
- if (early_apply_microcode(cpuid_1_eax, ed->old_rev, cp.data, cp.size))
+ if (early_apply_microcode(ed->old_rev, cp.data, cp.size))
native_rdmsr(MSR_AMD64_PATCH_LEVEL, ed->new_rev, dummy);
}
@@ -527,12 +597,10 @@ static int __init save_microcode_in_initrd(void)
if (dis_ucode_ldr || c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10)
return 0;
- find_blobs_in_containers(cpuid_1_eax, &cp);
+ find_blobs_in_containers(&cp);
if (!(cp.data && cp.size))
return -EINVAL;
- desc.cpuid_1_eax = cpuid_1_eax;
-
scan_containers(cp.data, cp.size, &desc);
if (!desc.mc)
return -EINVAL;
@@ -545,26 +613,65 @@ static int __init save_microcode_in_initrd(void)
}
early_initcall(save_microcode_in_initrd);
+static inline bool patch_cpus_equivalent(struct ucode_patch *p, struct ucode_patch *n)
+{
+ /* Zen and newer hardcode the f/m/s in the patch ID */
+ if (x86_family(bsp_cpuid_1_eax) >= 0x17) {
+ union cpuid_1_eax p_cid = ucode_rev_to_cpuid(p->patch_id);
+ union cpuid_1_eax n_cid = ucode_rev_to_cpuid(n->patch_id);
+
+ /* Zap stepping */
+ p_cid.stepping = 0;
+ n_cid.stepping = 0;
+
+ return p_cid.full == n_cid.full;
+ } else {
+ return p->equiv_cpu == n->equiv_cpu;
+ }
+}
+
/*
* a small, trivial cache of per-family ucode patches
*/
-static struct ucode_patch *cache_find_patch(u16 equiv_cpu)
+static struct ucode_patch *cache_find_patch(struct ucode_cpu_info *uci, u16 equiv_cpu)
{
struct ucode_patch *p;
+ struct ucode_patch n;
+
+ n.equiv_cpu = equiv_cpu;
+ n.patch_id = uci->cpu_sig.rev;
+
+ WARN_ON_ONCE(!n.patch_id);
list_for_each_entry(p, &microcode_cache, plist)
- if (p->equiv_cpu == equiv_cpu)
+ if (patch_cpus_equivalent(p, &n))
return p;
+
return NULL;
}
+static inline bool patch_newer(struct ucode_patch *p, struct ucode_patch *n)
+{
+ /* Zen and newer hardcode the f/m/s in the patch ID */
+ if (x86_family(bsp_cpuid_1_eax) >= 0x17) {
+ union zen_patch_rev zp, zn;
+
+ zp.ucode_rev = p->patch_id;
+ zn.ucode_rev = n->patch_id;
+
+ return zn.rev > zp.rev;
+ } else {
+ return n->patch_id > p->patch_id;
+ }
+}
+
static void update_cache(struct ucode_patch *new_patch)
{
struct ucode_patch *p;
list_for_each_entry(p, &microcode_cache, plist) {
- if (p->equiv_cpu == new_patch->equiv_cpu) {
- if (p->patch_id >= new_patch->patch_id) {
+ if (patch_cpus_equivalent(p, new_patch)) {
+ if (!patch_newer(p, new_patch)) {
/* we already have the latest patch */
kfree(new_patch->data);
kfree(new_patch);
@@ -595,13 +702,22 @@ static void free_cache(void)
static struct ucode_patch *find_patch(unsigned int cpu)
{
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
- u16 equiv_id;
+ u32 rev, dummy __always_unused;
+ u16 equiv_id = 0;
- equiv_id = find_equiv_id(&equiv_table, uci->cpu_sig.sig);
- if (!equiv_id)
- return NULL;
+ /* fetch rev if not populated yet: */
+ if (!uci->cpu_sig.rev) {
+ rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
+ uci->cpu_sig.rev = rev;
+ }
+
+ if (x86_family(bsp_cpuid_1_eax) < 0x17) {
+ equiv_id = find_equiv_id(&equiv_table, uci->cpu_sig.sig);
+ if (!equiv_id)
+ return NULL;
+ }
- return cache_find_patch(equiv_id);
+ return cache_find_patch(uci, equiv_id);
}
void reload_ucode_amd(unsigned int cpu)
@@ -651,7 +767,7 @@ static enum ucode_state apply_microcode_amd(int cpu)
struct ucode_cpu_info *uci;
struct ucode_patch *p;
enum ucode_state ret;
- u32 rev, dummy __always_unused;
+ u32 rev;
BUG_ON(raw_smp_processor_id() != cpu);
@@ -661,11 +777,11 @@ static enum ucode_state apply_microcode_amd(int cpu)
if (!p)
return UCODE_NFOUND;
+ rev = uci->cpu_sig.rev;
+
mc_amd = p->data;
uci->mc = p->data;
- rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
-
/* need to apply patch? */
if (rev > mc_amd->hdr.patch_id) {
ret = UCODE_OK;
@@ -711,6 +827,10 @@ static size_t install_equiv_cpu_table(const u8 *buf, size_t buf_size)
hdr = (const u32 *)buf;
equiv_tbl_len = hdr[2];
+ /* Zen and newer do not need an equivalence table. */
+ if (x86_family(bsp_cpuid_1_eax) >= 0x17)
+ goto out;
+
equiv_table.entry = vmalloc(equiv_tbl_len);
if (!equiv_table.entry) {
pr_err("failed to allocate equivalent CPU table\n");
@@ -720,12 +840,16 @@ static size_t install_equiv_cpu_table(const u8 *buf, size_t buf_size)
memcpy(equiv_table.entry, buf + CONTAINER_HDR_SZ, equiv_tbl_len);
equiv_table.num_entries = equiv_tbl_len / sizeof(struct equiv_cpu_entry);
+out:
/* add header length */
return equiv_tbl_len + CONTAINER_HDR_SZ;
}
static void free_equiv_cpu_table(void)
{
+ if (x86_family(bsp_cpuid_1_eax) >= 0x17)
+ return;
+
vfree(equiv_table.entry);
memset(&equiv_table, 0, sizeof(equiv_table));
}
@@ -751,7 +875,7 @@ static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover,
u16 proc_id;
int ret;
- ret = verify_patch(family, fw, leftover, patch_size);
+ ret = verify_patch(fw, leftover, patch_size);
if (ret)
return ret;
@@ -776,7 +900,7 @@ static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover,
patch->patch_id = mc_hdr->patch_id;
patch->equiv_cpu = proc_id;
- pr_debug("%s: Added patch_id: 0x%08x, proc_id: 0x%04x\n",
+ pr_debug("%s: Adding patch_id: 0x%08x, proc_id: 0x%04x\n",
__func__, patch->patch_id, proc_id);
/* ... and add to cache. */
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index 232026a239a6..b3658d11e7b6 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -60,11 +60,6 @@ module_param(force_minrev, bool, S_IRUSR | S_IWUSR);
*/
struct ucode_cpu_info ucode_cpu_info[NR_CPUS];
-struct cpu_info_ctx {
- struct cpu_signature *cpu_sig;
- int err;
-};
-
/*
* Those patch levels cannot be updated to newer ones and thus should be final.
*/
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index 5f0414452b67..815fa67356a2 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -21,7 +21,7 @@
#include <linux/uio.h>
#include <linux/mm.h>
-#include <asm/intel-family.h>
+#include <asm/cpu_device_id.h>
#include <asm/processor.h>
#include <asm/tlbflush.h>
#include <asm/setup.h>
@@ -577,8 +577,7 @@ static bool is_blacklisted(unsigned int cpu)
* This behavior is documented in item BDF90, #334165 (Intel Xeon
* Processor E7-8800/4800 v4 Product Family).
*/
- if (c->x86 == 6 &&
- c->x86_model == INTEL_FAM6_BROADWELL_X &&
+ if (c->x86_vfm == INTEL_BROADWELL_X &&
c->x86_stepping == 0x01 &&
llc_size_per_core > 2621440 &&
c->microcode < 0x0b000021) {
diff --git a/arch/x86/kernel/cpu/mkcapflags.sh b/arch/x86/kernel/cpu/mkcapflags.sh
index 1db560ed2ca3..68f537347466 100644
--- a/arch/x86/kernel/cpu/mkcapflags.sh
+++ b/arch/x86/kernel/cpu/mkcapflags.sh
@@ -30,8 +30,7 @@ dump_array()
# If the /* comment */ starts with a quote string, grab that.
VALUE="$(echo "$i" | sed -n 's@.*/\* *\("[^"]*"\).*\*/@\1@p')"
- [ -z "$VALUE" ] && VALUE="\"$NAME\""
- [ "$VALUE" = '""' ] && continue
+ [ ! "$VALUE" ] && continue
# Name is uppercase, VALUE is all lowercase
VALUE="$(echo "$VALUE" | tr A-Z a-z)"
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index e0fd57a8ba84..d18078834ded 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -16,7 +16,6 @@
#include <linux/interrupt.h>
#include <linux/irq.h>
#include <linux/kexec.h>
-#include <linux/i8253.h>
#include <linux/random.h>
#include <asm/processor.h>
#include <asm/hypervisor.h>
@@ -199,8 +198,8 @@ static void hv_machine_shutdown(void)
* Call hv_cpu_die() on all the CPUs, otherwise later the hypervisor
* corrupts the old VP Assist Pages and can crash the kexec kernel.
*/
- if (kexec_in_progress && hyperv_init_cpuhp > 0)
- cpuhp_remove_state(hyperv_init_cpuhp);
+ if (kexec_in_progress)
+ cpuhp_remove_state(CPUHP_AP_HYPERV_ONLINE);
/* The function calls stop_other_cpus(). */
native_machine_shutdown();
@@ -424,6 +423,7 @@ static void __init ms_hyperv_init_platform(void)
ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) {
x86_platform.calibrate_tsc = hv_get_tsc_khz;
x86_platform.calibrate_cpu = hv_get_tsc_khz;
+ setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
}
if (ms_hyperv.priv_high & HV_ISOLATION) {
@@ -449,9 +449,23 @@ static void __init ms_hyperv_init_platform(void)
ms_hyperv.hints &= ~HV_X64_APIC_ACCESS_RECOMMENDED;
if (!ms_hyperv.paravisor_present) {
- /* To be supported: more work is required. */
+ /*
+ * Mark the Hyper-V TSC page feature as disabled
+ * in a TDX VM without paravisor so that the
+ * Invariant TSC, which is a better clocksource
+ * anyway, is used instead.
+ */
ms_hyperv.features &= ~HV_MSR_REFERENCE_TSC_AVAILABLE;
+ /*
+ * The Invariant TSC is expected to be available
+ * in a TDX VM without paravisor, but if not,
+ * print a warning message. The slower Hyper-V MSR-based
+ * Ref Counter should end up being the clocksource.
+ */
+ if (!(ms_hyperv.features & HV_ACCESS_TSC_INVARIANT))
+ pr_warn("Hyper-V: Invariant TSC is unavailable\n");
+
/* HV_MSR_CRASH_CTL is unsupported. */
ms_hyperv.misc_features &= ~HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE;
@@ -522,16 +536,6 @@ static void __init ms_hyperv_init_platform(void)
if (efi_enabled(EFI_BOOT))
x86_platform.get_nmi_reason = hv_get_nmi_reason;
- /*
- * Hyper-V VMs have a PIT emulation quirk such that zeroing the
- * counter register during PIT shutdown restarts the PIT. So it
- * continues to interrupt @18.2 HZ. Setting i8253_clear_counter
- * to false tells pit_shutdown() not to zero the counter so that
- * the PIT really is shutdown. Generation 2 VMs don't have a PIT,
- * and setting this value has no effect.
- */
- i8253_clear_counter_on_shutdown = false;
-
#if IS_ENABLED(CONFIG_HYPERV)
if ((hv_get_isolation_type() == HV_ISOLATION_TYPE_VBS) ||
ms_hyperv.paravisor_present)
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.c b/arch/x86/kernel/cpu/mtrr/mtrr.c
index 767bf1c71aad..989d368be04f 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.c
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.c
@@ -55,6 +55,12 @@
#include "mtrr.h"
+static_assert(X86_MEMTYPE_UC == MTRR_TYPE_UNCACHABLE);
+static_assert(X86_MEMTYPE_WC == MTRR_TYPE_WRCOMB);
+static_assert(X86_MEMTYPE_WT == MTRR_TYPE_WRTHROUGH);
+static_assert(X86_MEMTYPE_WP == MTRR_TYPE_WRPROT);
+static_assert(X86_MEMTYPE_WB == MTRR_TYPE_WRBACK);
+
/* arch_phys_wc_add returns an MTRR register index plus this offset. */
#define MTRR_TO_PHYS_WC_OFFSET 1000
@@ -609,7 +615,7 @@ void mtrr_save_state(void)
{
int first_cpu;
- if (!mtrr_enabled())
+ if (!mtrr_enabled() || !mtrr_state.have_fixed)
return;
first_cpu = cpumask_first(cpu_online_mask);
diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
index 83e40341583e..8591d53c144b 100644
--- a/arch/x86/kernel/cpu/resctrl/core.c
+++ b/arch/x86/kernel/cpu/resctrl/core.c
@@ -19,10 +19,9 @@
#include <linux/cpu.h>
#include <linux/slab.h>
#include <linux/err.h>
-#include <linux/cacheinfo.h>
#include <linux/cpuhotplug.h>
-#include <asm/intel-family.h>
+#include <asm/cpu_device_id.h>
#include <asm/resctrl.h>
#include "internal.h"
@@ -56,16 +55,12 @@ int max_name_width, max_data_width;
*/
bool rdt_alloc_capable;
-static void
-mba_wrmsr_intel(struct rdt_domain *d, struct msr_param *m,
- struct rdt_resource *r);
-static void
-cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r);
-static void
-mba_wrmsr_amd(struct rdt_domain *d, struct msr_param *m,
- struct rdt_resource *r);
+static void mba_wrmsr_intel(struct msr_param *m);
+static void cat_wrmsr(struct msr_param *m);
+static void mba_wrmsr_amd(struct msr_param *m);
-#define domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].r_resctrl.domains)
+#define ctrl_domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].r_resctrl.ctrl_domains)
+#define mon_domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].r_resctrl.mon_domains)
struct rdt_hw_resource rdt_resources_all[] = {
[RDT_RESOURCE_L3] =
@@ -73,8 +68,10 @@ struct rdt_hw_resource rdt_resources_all[] = {
.r_resctrl = {
.rid = RDT_RESOURCE_L3,
.name = "L3",
- .cache_level = 3,
- .domains = domain_init(RDT_RESOURCE_L3),
+ .ctrl_scope = RESCTRL_L3_CACHE,
+ .mon_scope = RESCTRL_L3_CACHE,
+ .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_L3),
+ .mon_domains = mon_domain_init(RDT_RESOURCE_L3),
.parse_ctrlval = parse_cbm,
.format_str = "%d=%0*x",
.fflags = RFTYPE_RES_CACHE,
@@ -87,8 +84,8 @@ struct rdt_hw_resource rdt_resources_all[] = {
.r_resctrl = {
.rid = RDT_RESOURCE_L2,
.name = "L2",
- .cache_level = 2,
- .domains = domain_init(RDT_RESOURCE_L2),
+ .ctrl_scope = RESCTRL_L2_CACHE,
+ .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_L2),
.parse_ctrlval = parse_cbm,
.format_str = "%d=%0*x",
.fflags = RFTYPE_RES_CACHE,
@@ -101,8 +98,8 @@ struct rdt_hw_resource rdt_resources_all[] = {
.r_resctrl = {
.rid = RDT_RESOURCE_MBA,
.name = "MB",
- .cache_level = 3,
- .domains = domain_init(RDT_RESOURCE_MBA),
+ .ctrl_scope = RESCTRL_L3_CACHE,
+ .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_MBA),
.parse_ctrlval = parse_bw,
.format_str = "%d=%*u",
.fflags = RFTYPE_RES_MB,
@@ -113,8 +110,8 @@ struct rdt_hw_resource rdt_resources_all[] = {
.r_resctrl = {
.rid = RDT_RESOURCE_SMBA,
.name = "SMBA",
- .cache_level = 3,
- .domains = domain_init(RDT_RESOURCE_SMBA),
+ .ctrl_scope = RESCTRL_L3_CACHE,
+ .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_SMBA),
.parse_ctrlval = parse_bw,
.format_str = "%d=%*u",
.fflags = RFTYPE_RES_MB,
@@ -122,6 +119,14 @@ struct rdt_hw_resource rdt_resources_all[] = {
},
};
+u32 resctrl_arch_system_num_rmid_idx(void)
+{
+ struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
+
+ /* RMID are independent numbers for x86. num_rmid_idx == num_rmid */
+ return r->num_rmid;
+}
+
/*
* cache_alloc_hsw_probe() - Have to probe for Intel haswell server CPUs
* as they do not have CPUID enumeration support for Cache allocation.
@@ -309,12 +314,11 @@ static void rdt_get_cdp_l2_config(void)
rdt_get_cdp_config(RDT_RESOURCE_L2);
}
-static void
-mba_wrmsr_amd(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r)
+static void mba_wrmsr_amd(struct msr_param *m)
{
+ struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(m->dom);
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(m->res);
unsigned int i;
- struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
- struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
for (i = m->low; i < m->high; i++)
wrmsrl(hw_res->msr_base + i, hw_dom->ctrl_val[i]);
@@ -334,37 +338,51 @@ static u32 delay_bw_map(unsigned long bw, struct rdt_resource *r)
return r->default_ctrl;
}
-static void
-mba_wrmsr_intel(struct rdt_domain *d, struct msr_param *m,
- struct rdt_resource *r)
+static void mba_wrmsr_intel(struct msr_param *m)
{
+ struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(m->dom);
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(m->res);
unsigned int i;
- struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
- struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
/* Write the delay values for mba. */
for (i = m->low; i < m->high; i++)
- wrmsrl(hw_res->msr_base + i, delay_bw_map(hw_dom->ctrl_val[i], r));
+ wrmsrl(hw_res->msr_base + i, delay_bw_map(hw_dom->ctrl_val[i], m->res));
}
-static void
-cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r)
+static void cat_wrmsr(struct msr_param *m)
{
+ struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(m->dom);
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(m->res);
unsigned int i;
- struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
- struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
for (i = m->low; i < m->high; i++)
wrmsrl(hw_res->msr_base + i, hw_dom->ctrl_val[i]);
}
-struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r)
+struct rdt_ctrl_domain *get_ctrl_domain_from_cpu(int cpu, struct rdt_resource *r)
+{
+ struct rdt_ctrl_domain *d;
+
+ lockdep_assert_cpus_held();
+
+ list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
+ /* Find the domain that contains this CPU */
+ if (cpumask_test_cpu(cpu, &d->hdr.cpu_mask))
+ return d;
+ }
+
+ return NULL;
+}
+
+struct rdt_mon_domain *get_mon_domain_from_cpu(int cpu, struct rdt_resource *r)
{
- struct rdt_domain *d;
+ struct rdt_mon_domain *d;
- list_for_each_entry(d, &r->domains, list) {
+ lockdep_assert_cpus_held();
+
+ list_for_each_entry(d, &r->mon_domains, hdr.list) {
/* Find the domain that contains this CPU */
- if (cpumask_test_cpu(cpu, &d->cpu_mask))
+ if (cpumask_test_cpu(cpu, &d->hdr.cpu_mask))
return d;
}
@@ -378,40 +396,29 @@ u32 resctrl_arch_get_num_closid(struct rdt_resource *r)
void rdt_ctrl_update(void *arg)
{
+ struct rdt_hw_resource *hw_res;
struct msr_param *m = arg;
- struct rdt_hw_resource *hw_res = resctrl_to_arch_res(m->res);
- struct rdt_resource *r = m->res;
- int cpu = smp_processor_id();
- struct rdt_domain *d;
- d = get_domain_from_cpu(cpu, r);
- if (d) {
- hw_res->msr_update(d, m, r);
- return;
- }
- pr_warn_once("cpu %d not found in any domain for resource %s\n",
- cpu, r->name);
+ hw_res = resctrl_to_arch_res(m->res);
+ hw_res->msr_update(m);
}
/*
- * rdt_find_domain - Find a domain in a resource that matches input resource id
+ * rdt_find_domain - Search for a domain id in a resource domain list.
*
- * Search resource r's domain list to find the resource id. If the resource
- * id is found in a domain, return the domain. Otherwise, if requested by
- * caller, return the first domain whose id is bigger than the input id.
- * The domain list is sorted by id in ascending order.
+ * Search the domain list to find the domain id. If the domain id is
+ * found, return the domain. NULL otherwise. If the domain id is not
+ * found (and NULL returned) then the first domain with id bigger than
+ * the input id can be returned to the caller via @pos.
*/
-struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id,
- struct list_head **pos)
+struct rdt_domain_hdr *rdt_find_domain(struct list_head *h, int id,
+ struct list_head **pos)
{
- struct rdt_domain *d;
+ struct rdt_domain_hdr *d;
struct list_head *l;
- if (id < 0)
- return ERR_PTR(-ENODEV);
-
- list_for_each(l, &r->domains) {
- d = list_entry(l, struct rdt_domain, list);
+ list_for_each(l, h) {
+ d = list_entry(l, struct rdt_domain_hdr, list);
/* When id is found, return its domain. */
if (id == d->id)
return d;
@@ -440,18 +447,23 @@ static void setup_default_ctrlval(struct rdt_resource *r, u32 *dc)
*dc = r->default_ctrl;
}
-static void domain_free(struct rdt_hw_domain *hw_dom)
+static void ctrl_domain_free(struct rdt_hw_ctrl_domain *hw_dom)
+{
+ kfree(hw_dom->ctrl_val);
+ kfree(hw_dom);
+}
+
+static void mon_domain_free(struct rdt_hw_mon_domain *hw_dom)
{
kfree(hw_dom->arch_mbm_total);
kfree(hw_dom->arch_mbm_local);
- kfree(hw_dom->ctrl_val);
kfree(hw_dom);
}
-static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d)
+static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_ctrl_domain *d)
{
+ struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(d);
struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
- struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
struct msr_param m;
u32 *dc;
@@ -463,9 +475,11 @@ static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d)
hw_dom->ctrl_val = dc;
setup_default_ctrlval(r, dc);
+ m.res = r;
+ m.dom = d;
m.low = 0;
m.high = hw_res->num_closid;
- hw_res->msr_update(d, &m, r);
+ hw_res->msr_update(&m);
return 0;
}
@@ -474,7 +488,7 @@ static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d)
* @num_rmid: The size of the MBM counter array
* @hw_dom: The domain that owns the allocated arrays
*/
-static int arch_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_domain *hw_dom)
+static int arch_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_mon_domain *hw_dom)
{
size_t tsize;
@@ -497,37 +511,45 @@ static int arch_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_domain *hw_dom)
return 0;
}
-/*
- * domain_add_cpu - Add a cpu to a resource's domain list.
- *
- * If an existing domain in the resource r's domain list matches the cpu's
- * resource id, add the cpu in the domain.
- *
- * Otherwise, a new domain is allocated and inserted into the right position
- * in the domain list sorted by id in ascending order.
- *
- * The order in the domain list is visible to users when we print entries
- * in the schemata file and schemata input is validated to have the same order
- * as this list.
- */
-static void domain_add_cpu(int cpu, struct rdt_resource *r)
+static int get_domain_id_from_scope(int cpu, enum resctrl_scope scope)
{
- int id = get_cpu_cacheinfo_id(cpu, r->cache_level);
+ switch (scope) {
+ case RESCTRL_L2_CACHE:
+ case RESCTRL_L3_CACHE:
+ return get_cpu_cacheinfo_id(cpu, scope);
+ case RESCTRL_L3_NODE:
+ return cpu_to_node(cpu);
+ default:
+ break;
+ }
+
+ return -EINVAL;
+}
+
+static void domain_add_cpu_ctrl(int cpu, struct rdt_resource *r)
+{
+ int id = get_domain_id_from_scope(cpu, r->ctrl_scope);
+ struct rdt_hw_ctrl_domain *hw_dom;
struct list_head *add_pos = NULL;
- struct rdt_hw_domain *hw_dom;
- struct rdt_domain *d;
+ struct rdt_domain_hdr *hdr;
+ struct rdt_ctrl_domain *d;
int err;
lockdep_assert_held(&domain_list_lock);
- d = rdt_find_domain(r, id, &add_pos);
- if (IS_ERR(d)) {
- pr_warn("Couldn't find cache id for CPU %d\n", cpu);
+ if (id < 0) {
+ pr_warn_once("Can't find control domain id for CPU:%d scope:%d for resource %s\n",
+ cpu, r->ctrl_scope, r->name);
return;
}
- if (d) {
- cpumask_set_cpu(cpu, &d->cpu_mask);
+ hdr = rdt_find_domain(&r->ctrl_domains, id, &add_pos);
+ if (hdr) {
+ if (WARN_ON_ONCE(hdr->type != RESCTRL_CTRL_DOMAIN))
+ return;
+ d = container_of(hdr, struct rdt_ctrl_domain, hdr);
+
+ cpumask_set_cpu(cpu, &d->hdr.cpu_mask);
if (r->cache.arch_has_per_cpu_cfg)
rdt_domain_reconfigure_cdp(r);
return;
@@ -538,62 +560,185 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r)
return;
d = &hw_dom->d_resctrl;
- d->id = id;
- cpumask_set_cpu(cpu, &d->cpu_mask);
+ d->hdr.id = id;
+ d->hdr.type = RESCTRL_CTRL_DOMAIN;
+ cpumask_set_cpu(cpu, &d->hdr.cpu_mask);
rdt_domain_reconfigure_cdp(r);
- if (r->alloc_capable && domain_setup_ctrlval(r, d)) {
- domain_free(hw_dom);
+ if (domain_setup_ctrlval(r, d)) {
+ ctrl_domain_free(hw_dom);
+ return;
+ }
+
+ list_add_tail_rcu(&d->hdr.list, add_pos);
+
+ err = resctrl_online_ctrl_domain(r, d);
+ if (err) {
+ list_del_rcu(&d->hdr.list);
+ synchronize_rcu();
+ ctrl_domain_free(hw_dom);
+ }
+}
+
+static void domain_add_cpu_mon(int cpu, struct rdt_resource *r)
+{
+ int id = get_domain_id_from_scope(cpu, r->mon_scope);
+ struct list_head *add_pos = NULL;
+ struct rdt_hw_mon_domain *hw_dom;
+ struct rdt_domain_hdr *hdr;
+ struct rdt_mon_domain *d;
+ int err;
+
+ lockdep_assert_held(&domain_list_lock);
+
+ if (id < 0) {
+ pr_warn_once("Can't find monitor domain id for CPU:%d scope:%d for resource %s\n",
+ cpu, r->mon_scope, r->name);
+ return;
+ }
+
+ hdr = rdt_find_domain(&r->mon_domains, id, &add_pos);
+ if (hdr) {
+ if (WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN))
+ return;
+ d = container_of(hdr, struct rdt_mon_domain, hdr);
+
+ cpumask_set_cpu(cpu, &d->hdr.cpu_mask);
+ return;
+ }
+
+ hw_dom = kzalloc_node(sizeof(*hw_dom), GFP_KERNEL, cpu_to_node(cpu));
+ if (!hw_dom)
+ return;
+
+ d = &hw_dom->d_resctrl;
+ d->hdr.id = id;
+ d->hdr.type = RESCTRL_MON_DOMAIN;
+ d->ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE);
+ if (!d->ci) {
+ pr_warn_once("Can't find L3 cache for CPU:%d resource %s\n", cpu, r->name);
+ mon_domain_free(hw_dom);
return;
}
+ cpumask_set_cpu(cpu, &d->hdr.cpu_mask);
- if (r->mon_capable && arch_domain_mbm_alloc(r->num_rmid, hw_dom)) {
- domain_free(hw_dom);
+ arch_mon_domain_online(r, d);
+
+ if (arch_domain_mbm_alloc(r->num_rmid, hw_dom)) {
+ mon_domain_free(hw_dom);
return;
}
- list_add_tail_rcu(&d->list, add_pos);
+ list_add_tail_rcu(&d->hdr.list, add_pos);
- err = resctrl_online_domain(r, d);
+ err = resctrl_online_mon_domain(r, d);
if (err) {
- list_del_rcu(&d->list);
+ list_del_rcu(&d->hdr.list);
synchronize_rcu();
- domain_free(hw_dom);
+ mon_domain_free(hw_dom);
}
}
-static void domain_remove_cpu(int cpu, struct rdt_resource *r)
+static void domain_add_cpu(int cpu, struct rdt_resource *r)
{
- int id = get_cpu_cacheinfo_id(cpu, r->cache_level);
- struct rdt_hw_domain *hw_dom;
- struct rdt_domain *d;
+ if (r->alloc_capable)
+ domain_add_cpu_ctrl(cpu, r);
+ if (r->mon_capable)
+ domain_add_cpu_mon(cpu, r);
+}
+
+static void domain_remove_cpu_ctrl(int cpu, struct rdt_resource *r)
+{
+ int id = get_domain_id_from_scope(cpu, r->ctrl_scope);
+ struct rdt_hw_ctrl_domain *hw_dom;
+ struct rdt_domain_hdr *hdr;
+ struct rdt_ctrl_domain *d;
lockdep_assert_held(&domain_list_lock);
- d = rdt_find_domain(r, id, NULL);
- if (IS_ERR_OR_NULL(d)) {
- pr_warn("Couldn't find cache id for CPU %d\n", cpu);
+ if (id < 0) {
+ pr_warn_once("Can't find control domain id for CPU:%d scope:%d for resource %s\n",
+ cpu, r->ctrl_scope, r->name);
+ return;
+ }
+
+ hdr = rdt_find_domain(&r->ctrl_domains, id, NULL);
+ if (!hdr) {
+ pr_warn("Can't find control domain for id=%d for CPU %d for resource %s\n",
+ id, cpu, r->name);
return;
}
- hw_dom = resctrl_to_arch_dom(d);
- cpumask_clear_cpu(cpu, &d->cpu_mask);
- if (cpumask_empty(&d->cpu_mask)) {
- resctrl_offline_domain(r, d);
- list_del_rcu(&d->list);
+ if (WARN_ON_ONCE(hdr->type != RESCTRL_CTRL_DOMAIN))
+ return;
+
+ d = container_of(hdr, struct rdt_ctrl_domain, hdr);
+ hw_dom = resctrl_to_arch_ctrl_dom(d);
+
+ cpumask_clear_cpu(cpu, &d->hdr.cpu_mask);
+ if (cpumask_empty(&d->hdr.cpu_mask)) {
+ resctrl_offline_ctrl_domain(r, d);
+ list_del_rcu(&d->hdr.list);
synchronize_rcu();
/*
- * rdt_domain "d" is going to be freed below, so clear
+ * rdt_ctrl_domain "d" is going to be freed below, so clear
* its pointer from pseudo_lock_region struct.
*/
if (d->plr)
d->plr->d = NULL;
- domain_free(hw_dom);
+ ctrl_domain_free(hw_dom);
+
+ return;
+ }
+}
+
+static void domain_remove_cpu_mon(int cpu, struct rdt_resource *r)
+{
+ int id = get_domain_id_from_scope(cpu, r->mon_scope);
+ struct rdt_hw_mon_domain *hw_dom;
+ struct rdt_domain_hdr *hdr;
+ struct rdt_mon_domain *d;
+
+ lockdep_assert_held(&domain_list_lock);
+
+ if (id < 0) {
+ pr_warn_once("Can't find monitor domain id for CPU:%d scope:%d for resource %s\n",
+ cpu, r->mon_scope, r->name);
+ return;
+ }
+ hdr = rdt_find_domain(&r->mon_domains, id, NULL);
+ if (!hdr) {
+ pr_warn("Can't find monitor domain for id=%d for CPU %d for resource %s\n",
+ id, cpu, r->name);
return;
}
+
+ if (WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN))
+ return;
+
+ d = container_of(hdr, struct rdt_mon_domain, hdr);
+ hw_dom = resctrl_to_arch_mon_dom(d);
+
+ cpumask_clear_cpu(cpu, &d->hdr.cpu_mask);
+ if (cpumask_empty(&d->hdr.cpu_mask)) {
+ resctrl_offline_mon_domain(r, d);
+ list_del_rcu(&d->hdr.list);
+ synchronize_rcu();
+ mon_domain_free(hw_dom);
+
+ return;
+ }
+}
+
+static void domain_remove_cpu(int cpu, struct rdt_resource *r)
+{
+ if (r->alloc_capable)
+ domain_remove_cpu_ctrl(cpu, r);
+ if (r->mon_capable)
+ domain_remove_cpu_mon(cpu, r);
}
static void clear_closid_rmid(int cpu)
@@ -821,18 +966,18 @@ static __init bool get_rdt_mon_resources(void)
static __init void __check_quirks_intel(void)
{
- switch (boot_cpu_data.x86_model) {
- case INTEL_FAM6_HASWELL_X:
+ switch (boot_cpu_data.x86_vfm) {
+ case INTEL_HASWELL_X:
if (!rdt_options[RDT_FLAG_L3_CAT].force_off)
cache_alloc_hsw_probe();
break;
- case INTEL_FAM6_SKYLAKE_X:
+ case INTEL_SKYLAKE_X:
if (boot_cpu_data.x86_stepping <= 4)
set_rdt_options("!cmt,!mbmtotal,!mbmlocal,!l3cat");
else
set_rdt_options("!l3cat");
fallthrough;
- case INTEL_FAM6_BROADWELL_X:
+ case INTEL_BROADWELL_X:
intel_rdt_mbm_apply_quirk();
break;
}
diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
index 7997b47743a2..50fa1fe9a073 100644
--- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
+++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
@@ -60,7 +60,7 @@ static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r)
}
int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s,
- struct rdt_domain *d)
+ struct rdt_ctrl_domain *d)
{
struct resctrl_staged_config *cfg;
u32 closid = data->rdtgrp->closid;
@@ -69,7 +69,7 @@ int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s,
cfg = &d->staged_config[s->conf_type];
if (cfg->have_new_ctrl) {
- rdt_last_cmd_printf("Duplicate domain %d\n", d->id);
+ rdt_last_cmd_printf("Duplicate domain %d\n", d->hdr.id);
return -EINVAL;
}
@@ -139,7 +139,7 @@ static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r)
* resource type.
*/
int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s,
- struct rdt_domain *d)
+ struct rdt_ctrl_domain *d)
{
struct rdtgroup *rdtgrp = data->rdtgrp;
struct resctrl_staged_config *cfg;
@@ -148,7 +148,7 @@ int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s,
cfg = &d->staged_config[s->conf_type];
if (cfg->have_new_ctrl) {
- rdt_last_cmd_printf("Duplicate domain %d\n", d->id);
+ rdt_last_cmd_printf("Duplicate domain %d\n", d->hdr.id);
return -EINVAL;
}
@@ -208,8 +208,8 @@ static int parse_line(char *line, struct resctrl_schema *s,
struct resctrl_staged_config *cfg;
struct rdt_resource *r = s->res;
struct rdt_parse_data data;
+ struct rdt_ctrl_domain *d;
char *dom = NULL, *id;
- struct rdt_domain *d;
unsigned long dom_id;
/* Walking r->domains, ensure it can't race with cpuhp */
@@ -231,8 +231,8 @@ next:
return -EINVAL;
}
dom = strim(dom);
- list_for_each_entry(d, &r->domains, list) {
- if (d->id == dom_id) {
+ list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
+ if (d->hdr.id == dom_id) {
data.buf = dom;
data.rdtgrp = rdtgrp;
if (r->parse_ctrlval(&data, s, d))
@@ -272,39 +272,24 @@ static u32 get_config_index(u32 closid, enum resctrl_conf_type type)
}
}
-static bool apply_config(struct rdt_hw_domain *hw_dom,
- struct resctrl_staged_config *cfg, u32 idx,
- cpumask_var_t cpu_mask)
-{
- struct rdt_domain *dom = &hw_dom->d_resctrl;
-
- if (cfg->new_ctrl != hw_dom->ctrl_val[idx]) {
- cpumask_set_cpu(cpumask_any(&dom->cpu_mask), cpu_mask);
- hw_dom->ctrl_val[idx] = cfg->new_ctrl;
-
- return true;
- }
-
- return false;
-}
-
-int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_domain *d,
+int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d,
u32 closid, enum resctrl_conf_type t, u32 cfg_val)
{
+ struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(d);
struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
- struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
u32 idx = get_config_index(closid, t);
struct msr_param msr_param;
- if (!cpumask_test_cpu(smp_processor_id(), &d->cpu_mask))
+ if (!cpumask_test_cpu(smp_processor_id(), &d->hdr.cpu_mask))
return -EINVAL;
hw_dom->ctrl_val[idx] = cfg_val;
msr_param.res = r;
+ msr_param.dom = d;
msr_param.low = idx;
msr_param.high = idx + 1;
- hw_res->msr_update(d, &msr_param, r);
+ hw_res->msr_update(&msr_param);
return 0;
}
@@ -312,51 +297,42 @@ int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_domain *d,
int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid)
{
struct resctrl_staged_config *cfg;
- struct rdt_hw_domain *hw_dom;
+ struct rdt_hw_ctrl_domain *hw_dom;
struct msr_param msr_param;
+ struct rdt_ctrl_domain *d;
enum resctrl_conf_type t;
- cpumask_var_t cpu_mask;
- struct rdt_domain *d;
u32 idx;
/* Walking r->domains, ensure it can't race with cpuhp */
lockdep_assert_cpus_held();
- if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
- return -ENOMEM;
-
- msr_param.res = NULL;
- list_for_each_entry(d, &r->domains, list) {
- hw_dom = resctrl_to_arch_dom(d);
+ list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
+ hw_dom = resctrl_to_arch_ctrl_dom(d);
+ msr_param.res = NULL;
for (t = 0; t < CDP_NUM_TYPES; t++) {
cfg = &hw_dom->d_resctrl.staged_config[t];
if (!cfg->have_new_ctrl)
continue;
idx = get_config_index(closid, t);
- if (!apply_config(hw_dom, cfg, idx, cpu_mask))
+ if (cfg->new_ctrl == hw_dom->ctrl_val[idx])
continue;
+ hw_dom->ctrl_val[idx] = cfg->new_ctrl;
if (!msr_param.res) {
msr_param.low = idx;
msr_param.high = msr_param.low + 1;
msr_param.res = r;
+ msr_param.dom = d;
} else {
msr_param.low = min(msr_param.low, idx);
msr_param.high = max(msr_param.high, idx + 1);
}
}
+ if (msr_param.res)
+ smp_call_function_any(&d->hdr.cpu_mask, rdt_ctrl_update, &msr_param, 1);
}
- if (cpumask_empty(cpu_mask))
- goto done;
-
- /* Update resource control msr on all the CPUs. */
- on_each_cpu_mask(cpu_mask, rdt_ctrl_update, &msr_param, 1);
-
-done:
- free_cpumask_var(cpu_mask);
-
return 0;
}
@@ -454,10 +430,10 @@ out:
return ret ?: nbytes;
}
-u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_domain *d,
+u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d,
u32 closid, enum resctrl_conf_type type)
{
- struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
+ struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(d);
u32 idx = get_config_index(closid, type);
return hw_dom->ctrl_val[idx];
@@ -466,7 +442,7 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_domain *d,
static void show_doms(struct seq_file *s, struct resctrl_schema *schema, int closid)
{
struct rdt_resource *r = schema->res;
- struct rdt_domain *dom;
+ struct rdt_ctrl_domain *dom;
bool sep = false;
u32 ctrl_val;
@@ -474,7 +450,7 @@ static void show_doms(struct seq_file *s, struct resctrl_schema *schema, int clo
lockdep_assert_cpus_held();
seq_printf(s, "%*s:", max_name_width, schema->name);
- list_for_each_entry(dom, &r->domains, list) {
+ list_for_each_entry(dom, &r->ctrl_domains, hdr.list) {
if (sep)
seq_puts(s, ";");
@@ -484,7 +460,7 @@ static void show_doms(struct seq_file *s, struct resctrl_schema *schema, int clo
ctrl_val = resctrl_arch_get_config(r, dom, closid,
schema->conf_type);
- seq_printf(s, r->format_str, dom->id, max_data_width,
+ seq_printf(s, r->format_str, dom->hdr.id, max_data_width,
ctrl_val);
sep = true;
}
@@ -513,7 +489,7 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of,
} else {
seq_printf(s, "%s:%d=%x\n",
rdtgrp->plr->s->res->name,
- rdtgrp->plr->d->id,
+ rdtgrp->plr->d->hdr.id,
rdtgrp->plr->cbm);
}
} else {
@@ -538,8 +514,8 @@ static int smp_mon_event_count(void *arg)
}
void mon_event_read(struct rmid_read *rr, struct rdt_resource *r,
- struct rdt_domain *d, struct rdtgroup *rdtgrp,
- int evtid, int first)
+ struct rdt_mon_domain *d, struct rdtgroup *rdtgrp,
+ cpumask_t *cpumask, int evtid, int first)
{
int cpu;
@@ -553,7 +529,6 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r,
rr->evtid = evtid;
rr->r = r;
rr->d = d;
- rr->val = 0;
rr->first = first;
rr->arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, evtid);
if (IS_ERR(rr->arch_mon_ctx)) {
@@ -561,7 +536,7 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r,
return;
}
- cpu = cpumask_any_housekeeping(&d->cpu_mask, RESCTRL_PICK_ANY_CPU);
+ cpu = cpumask_any_housekeeping(cpumask, RESCTRL_PICK_ANY_CPU);
/*
* cpumask_any_housekeeping() prefers housekeeping CPUs, but
@@ -570,7 +545,7 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r,
* counters on some platforms if its called in IRQ context.
*/
if (tick_nohz_full_cpu(cpu))
- smp_call_function_any(&d->cpu_mask, mon_event_count, rr, 1);
+ smp_call_function_any(cpumask, mon_event_count, rr, 1);
else
smp_call_on_cpu(cpu, smp_mon_event_count, rr, false);
@@ -580,12 +555,13 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r,
int rdtgroup_mondata_show(struct seq_file *m, void *arg)
{
struct kernfs_open_file *of = m->private;
+ struct rdt_domain_hdr *hdr;
+ struct rmid_read rr = {0};
+ struct rdt_mon_domain *d;
u32 resid, evtid, domid;
struct rdtgroup *rdtgrp;
struct rdt_resource *r;
union mon_data_bits md;
- struct rdt_domain *d;
- struct rmid_read rr;
int ret = 0;
rdtgrp = rdtgroup_kn_lock_live(of->kn);
@@ -598,15 +574,40 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg)
resid = md.u.rid;
domid = md.u.domid;
evtid = md.u.evtid;
-
r = &rdt_resources_all[resid].r_resctrl;
- d = rdt_find_domain(r, domid, NULL);
- if (IS_ERR_OR_NULL(d)) {
+
+ if (md.u.sum) {
+ /*
+ * This file requires summing across all domains that share
+ * the L3 cache id that was provided in the "domid" field of the
+ * mon_data_bits union. Search all domains in the resource for
+ * one that matches this cache id.
+ */
+ list_for_each_entry(d, &r->mon_domains, hdr.list) {
+ if (d->ci->id == domid) {
+ rr.ci = d->ci;
+ mon_event_read(&rr, r, NULL, rdtgrp,
+ &d->ci->shared_cpu_map, evtid, false);
+ goto checkresult;
+ }
+ }
ret = -ENOENT;
goto out;
+ } else {
+ /*
+ * This file provides data from a single domain. Search
+ * the resource to find the domain with "domid".
+ */
+ hdr = rdt_find_domain(&r->mon_domains, domid, NULL);
+ if (!hdr || WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN)) {
+ ret = -ENOENT;
+ goto out;
+ }
+ d = container_of(hdr, struct rdt_mon_domain, hdr);
+ mon_event_read(&rr, r, d, rdtgrp, &d->hdr.cpu_mask, evtid, false);
}
- mon_event_read(&rr, r, d, rdtgrp, evtid, false);
+checkresult:
if (rr.err == -EIO)
seq_puts(m, "Error\n");
diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
index 1a8687f8073a..955999aecfca 100644
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -127,29 +127,54 @@ struct mon_evt {
};
/**
- * union mon_data_bits - Monitoring details for each event file
+ * union mon_data_bits - Monitoring details for each event file.
* @priv: Used to store monitoring event data in @u
- * as kernfs private data
- * @rid: Resource id associated with the event file
- * @evtid: Event id associated with the event file
- * @domid: The domain to which the event file belongs
- * @u: Name of the bit fields struct
+ * as kernfs private data.
+ * @u.rid: Resource id associated with the event file.
+ * @u.evtid: Event id associated with the event file.
+ * @u.sum: Set when event must be summed across multiple
+ * domains.
+ * @u.domid: When @u.sum is zero this is the domain to which
+ * the event file belongs. When @sum is one this
+ * is the id of the L3 cache that all domains to be
+ * summed share.
+ * @u: Name of the bit fields struct.
*/
union mon_data_bits {
void *priv;
struct {
unsigned int rid : 10;
- enum resctrl_event_id evtid : 8;
+ enum resctrl_event_id evtid : 7;
+ unsigned int sum : 1;
unsigned int domid : 14;
} u;
};
+/**
+ * struct rmid_read - Data passed across smp_call*() to read event count.
+ * @rgrp: Resource group for which the counter is being read. If it is a parent
+ * resource group then its event count is summed with the count from all
+ * its child resource groups.
+ * @r: Resource describing the properties of the event being read.
+ * @d: Domain that the counter should be read from. If NULL then sum all
+ * domains in @r sharing L3 @ci.id
+ * @evtid: Which monitor event to read.
+ * @first: Initialize MBM counter when true.
+ * @ci: Cacheinfo for L3. Only set when @d is NULL. Used when summing domains.
+ * @err: Error encountered when reading counter.
+ * @val: Returned value of event counter. If @rgrp is a parent resource group,
+ * @val includes the sum of event counts from its child resource groups.
+ * If @d is NULL, @val includes the sum of all domains in @r sharing @ci.id,
+ * (summed across child resource groups if @rgrp is a parent resource group).
+ * @arch_mon_ctx: Hardware monitor allocated for this read request (MPAM only).
+ */
struct rmid_read {
struct rdtgroup *rgrp;
struct rdt_resource *r;
- struct rdt_domain *d;
+ struct rdt_mon_domain *d;
enum resctrl_event_id evtid;
bool first;
+ struct cacheinfo *ci;
int err;
u64 val;
void *arch_mon_ctx;
@@ -232,7 +257,7 @@ struct mongroup {
*/
struct pseudo_lock_region {
struct resctrl_schema *s;
- struct rdt_domain *d;
+ struct rdt_ctrl_domain *d;
u32 cbm;
wait_queue_head_t lock_thread_wq;
int thread_done;
@@ -355,35 +380,53 @@ struct arch_mbm_state {
};
/**
- * struct rdt_hw_domain - Arch private attributes of a set of CPUs that share
- * a resource
+ * struct rdt_hw_ctrl_domain - Arch private attributes of a set of CPUs that share
+ * a resource for a control function
* @d_resctrl: Properties exposed to the resctrl file system
* @ctrl_val: array of cache or mem ctrl values (indexed by CLOSID)
+ *
+ * Members of this structure are accessed via helpers that provide abstraction.
+ */
+struct rdt_hw_ctrl_domain {
+ struct rdt_ctrl_domain d_resctrl;
+ u32 *ctrl_val;
+};
+
+/**
+ * struct rdt_hw_mon_domain - Arch private attributes of a set of CPUs that share
+ * a resource for a monitor function
+ * @d_resctrl: Properties exposed to the resctrl file system
* @arch_mbm_total: arch private state for MBM total bandwidth
* @arch_mbm_local: arch private state for MBM local bandwidth
*
* Members of this structure are accessed via helpers that provide abstraction.
*/
-struct rdt_hw_domain {
- struct rdt_domain d_resctrl;
- u32 *ctrl_val;
+struct rdt_hw_mon_domain {
+ struct rdt_mon_domain d_resctrl;
struct arch_mbm_state *arch_mbm_total;
struct arch_mbm_state *arch_mbm_local;
};
-static inline struct rdt_hw_domain *resctrl_to_arch_dom(struct rdt_domain *r)
+static inline struct rdt_hw_ctrl_domain *resctrl_to_arch_ctrl_dom(struct rdt_ctrl_domain *r)
{
- return container_of(r, struct rdt_hw_domain, d_resctrl);
+ return container_of(r, struct rdt_hw_ctrl_domain, d_resctrl);
+}
+
+static inline struct rdt_hw_mon_domain *resctrl_to_arch_mon_dom(struct rdt_mon_domain *r)
+{
+ return container_of(r, struct rdt_hw_mon_domain, d_resctrl);
}
/**
* struct msr_param - set a range of MSRs from a domain
* @res: The resource to use
+ * @dom: The domain to update
* @low: Beginning index from base MSR
* @high: End index
*/
struct msr_param {
struct rdt_resource *res;
+ struct rdt_ctrl_domain *dom;
u32 low;
u32 high;
};
@@ -443,8 +486,7 @@ struct rdt_hw_resource {
struct rdt_resource r_resctrl;
u32 num_closid;
unsigned int msr_base;
- void (*msr_update) (struct rdt_domain *d, struct msr_param *m,
- struct rdt_resource *r);
+ void (*msr_update)(struct msr_param *m);
unsigned int mon_scale;
unsigned int mbm_width;
unsigned int mbm_cfg_mask;
@@ -457,9 +499,9 @@ static inline struct rdt_hw_resource *resctrl_to_arch_res(struct rdt_resource *r
}
int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s,
- struct rdt_domain *d);
+ struct rdt_ctrl_domain *d);
int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s,
- struct rdt_domain *d);
+ struct rdt_ctrl_domain *d);
extern struct mutex rdtgroup_mutex;
@@ -492,6 +534,8 @@ static inline bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level l)
int resctrl_arch_set_cdp_enabled(enum resctrl_res_level l, bool enable);
+void arch_mon_domain_online(struct rdt_resource *r, struct rdt_mon_domain *d);
+
/*
* To return the common struct rdt_resource, which is contained in struct
* rdt_hw_resource, walk the resctrl member of struct rdt_hw_resource.
@@ -557,27 +601,28 @@ void rdtgroup_kn_unlock(struct kernfs_node *kn);
int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name);
int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name,
umode_t mask);
-struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id,
- struct list_head **pos);
+struct rdt_domain_hdr *rdt_find_domain(struct list_head *h, int id,
+ struct list_head **pos);
ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off);
int rdtgroup_schemata_show(struct kernfs_open_file *of,
struct seq_file *s, void *v);
-bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_domain *d,
+bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_ctrl_domain *d,
unsigned long cbm, int closid, bool exclusive);
-unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, struct rdt_domain *d,
+unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, struct rdt_ctrl_domain *d,
unsigned long cbm);
enum rdtgrp_mode rdtgroup_mode_by_closid(int closid);
int rdtgroup_tasks_assigned(struct rdtgroup *r);
int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp);
int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp);
-bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_domain *d, unsigned long cbm);
-bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d);
+bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_ctrl_domain *d, unsigned long cbm);
+bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d);
int rdt_pseudo_lock_init(void);
void rdt_pseudo_lock_release(void);
int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp);
void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp);
-struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r);
+struct rdt_ctrl_domain *get_ctrl_domain_from_cpu(int cpu, struct rdt_resource *r);
+struct rdt_mon_domain *get_mon_domain_from_cpu(int cpu, struct rdt_resource *r);
int closids_supported(void);
void closid_free(int closid);
int alloc_rmid(u32 closid);
@@ -588,19 +633,19 @@ bool __init rdt_cpu_has(int flag);
void mon_event_count(void *info);
int rdtgroup_mondata_show(struct seq_file *m, void *arg);
void mon_event_read(struct rmid_read *rr, struct rdt_resource *r,
- struct rdt_domain *d, struct rdtgroup *rdtgrp,
- int evtid, int first);
-void mbm_setup_overflow_handler(struct rdt_domain *dom,
+ struct rdt_mon_domain *d, struct rdtgroup *rdtgrp,
+ cpumask_t *cpumask, int evtid, int first);
+void mbm_setup_overflow_handler(struct rdt_mon_domain *dom,
unsigned long delay_ms,
int exclude_cpu);
void mbm_handle_overflow(struct work_struct *work);
void __init intel_rdt_mbm_apply_quirk(void);
bool is_mba_sc(struct rdt_resource *r);
-void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms,
+void cqm_setup_limbo_handler(struct rdt_mon_domain *dom, unsigned long delay_ms,
int exclude_cpu);
void cqm_handle_limbo(struct work_struct *work);
-bool has_busy_rmid(struct rdt_domain *d);
-void __check_limbo(struct rdt_domain *d, bool force_free);
+bool has_busy_rmid(struct rdt_mon_domain *d);
+void __check_limbo(struct rdt_mon_domain *d, bool force_free);
void rdt_domain_reconfigure_cdp(struct rdt_resource *r);
void __init thread_throttle_mode_init(void);
void __init mbm_config_rftype_init(const char *config);
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index c34a35ec0f03..851b561850e0 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -15,6 +15,8 @@
* Software Developer Manual June 2016, volume 3, section 17.17.
*/
+#define pr_fmt(fmt) "resctrl: " fmt
+
#include <linux/cpu.h>
#include <linux/module.h>
#include <linux/sizes.h>
@@ -24,6 +26,7 @@
#include <asm/resctrl.h>
#include "internal.h"
+#include "trace.h"
/**
* struct rmid_entry - dirty tracking for all RMID.
@@ -96,6 +99,8 @@ unsigned int resctrl_rmid_realloc_limit;
#define CF(cf) ((unsigned long)(1048576 * (cf) + 0.5))
+static int snc_nodes_per_l3_cache = 1;
+
/*
* The correction factor table is documented in Documentation/arch/x86/resctrl.rst.
* If rmid > rmid threshold, MBM total and local values should be multiplied
@@ -184,7 +189,43 @@ static inline struct rmid_entry *__rmid_entry(u32 idx)
return entry;
}
-static int __rmid_read(u32 rmid, enum resctrl_event_id eventid, u64 *val)
+/*
+ * When Sub-NUMA Cluster (SNC) mode is not enabled (as indicated by
+ * "snc_nodes_per_l3_cache == 1") no translation of the RMID value is
+ * needed. The physical RMID is the same as the logical RMID.
+ *
+ * On a platform with SNC mode enabled, Linux enables RMID sharing mode
+ * via MSR 0xCA0 (see the "RMID Sharing Mode" section in the "Intel
+ * Resource Director Technology Architecture Specification" for a full
+ * description of RMID sharing mode).
+ *
+ * In RMID sharing mode there are fewer "logical RMID" values available
+ * to accumulate data ("physical RMIDs" are divided evenly between SNC
+ * nodes that share an L3 cache). Linux creates an rdt_mon_domain for
+ * each SNC node.
+ *
+ * The value loaded into IA32_PQR_ASSOC is the "logical RMID".
+ *
+ * Data is collected independently on each SNC node and can be retrieved
+ * using the "physical RMID" value computed by this function and loaded
+ * into IA32_QM_EVTSEL. @cpu can be any CPU in the SNC node.
+ *
+ * The scope of the IA32_QM_EVTSEL and IA32_QM_CTR MSRs is at the L3
+ * cache. So a "physical RMID" may be read from any CPU that shares
+ * the L3 cache with the desired SNC node, not just from a CPU in
+ * the specific SNC node.
+ */
+static int logical_rmid_to_physical_rmid(int cpu, int lrmid)
+{
+ struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
+
+ if (snc_nodes_per_l3_cache == 1)
+ return lrmid;
+
+ return lrmid + (cpu_to_node(cpu) % snc_nodes_per_l3_cache) * r->num_rmid;
+}
+
+static int __rmid_read_phys(u32 prmid, enum resctrl_event_id eventid, u64 *val)
{
u64 msr_val;
@@ -196,7 +237,7 @@ static int __rmid_read(u32 rmid, enum resctrl_event_id eventid, u64 *val)
* IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62)
* are error bits.
*/
- wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid);
+ wrmsr(MSR_IA32_QM_EVTSEL, eventid, prmid);
rdmsrl(MSR_IA32_QM_CTR, msr_val);
if (msr_val & RMID_VAL_ERROR)
@@ -208,7 +249,7 @@ static int __rmid_read(u32 rmid, enum resctrl_event_id eventid, u64 *val)
return 0;
}
-static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_domain *hw_dom,
+static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_mon_domain *hw_dom,
u32 rmid,
enum resctrl_event_id eventid)
{
@@ -227,19 +268,22 @@ static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_domain *hw_dom,
return NULL;
}
-void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_domain *d,
+void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d,
u32 unused, u32 rmid,
enum resctrl_event_id eventid)
{
- struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
+ struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
+ int cpu = cpumask_any(&d->hdr.cpu_mask);
struct arch_mbm_state *am;
+ u32 prmid;
am = get_arch_mbm_state(hw_dom, rmid, eventid);
if (am) {
memset(am, 0, sizeof(*am));
+ prmid = logical_rmid_to_physical_rmid(cpu, rmid);
/* Record any initial, non-zero count value. */
- __rmid_read(rmid, eventid, &am->prev_msr);
+ __rmid_read_phys(prmid, eventid, &am->prev_msr);
}
}
@@ -247,9 +291,9 @@ void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_domain *d,
* Assumes that hardware counters are also reset and thus that there is
* no need to record initial non-zero counts.
*/
-void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_domain *d)
+void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain *d)
{
- struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
+ struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
if (is_mbm_total_enabled())
memset(hw_dom->arch_mbm_total, 0,
@@ -268,22 +312,22 @@ static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width)
return chunks >> shift;
}
-int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d,
+int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d,
u32 unused, u32 rmid, enum resctrl_event_id eventid,
u64 *val, void *ignored)
{
+ struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
- struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
+ int cpu = cpumask_any(&d->hdr.cpu_mask);
struct arch_mbm_state *am;
u64 msr_val, chunks;
+ u32 prmid;
int ret;
resctrl_arch_rmid_read_context_check();
- if (!cpumask_test_cpu(smp_processor_id(), &d->cpu_mask))
- return -EINVAL;
-
- ret = __rmid_read(rmid, eventid, &msr_val);
+ prmid = logical_rmid_to_physical_rmid(cpu, rmid);
+ ret = __rmid_read_phys(prmid, eventid, &msr_val);
if (ret)
return ret;
@@ -319,7 +363,7 @@ static void limbo_release_entry(struct rmid_entry *entry)
* decrement the count. If the busy count gets to zero on an RMID, we
* free the RMID
*/
-void __check_limbo(struct rdt_domain *d, bool force_free)
+void __check_limbo(struct rdt_mon_domain *d, bool force_free)
{
struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
u32 idx_limit = resctrl_arch_system_num_rmid_idx();
@@ -354,6 +398,16 @@ void __check_limbo(struct rdt_domain *d, bool force_free)
rmid_dirty = true;
} else {
rmid_dirty = (val >= resctrl_rmid_realloc_threshold);
+
+ /*
+ * x86's CLOSID and RMID are independent numbers, so the entry's
+ * CLOSID is an empty CLOSID (X86_RESCTRL_EMPTY_CLOSID). On Arm the
+ * RMID (PMG) extends the CLOSID (PARTID) space with bits that aren't
+ * used to select the configuration. It is thus necessary to track both
+ * CLOSID and RMID because there may be dependencies between them
+ * on some architectures.
+ */
+ trace_mon_llc_occupancy_limbo(entry->closid, entry->rmid, d->hdr.id, val);
}
if (force_free || !rmid_dirty) {
@@ -367,7 +421,7 @@ void __check_limbo(struct rdt_domain *d, bool force_free)
resctrl_arch_mon_ctx_free(r, QOS_L3_OCCUP_EVENT_ID, arch_mon_ctx);
}
-bool has_busy_rmid(struct rdt_domain *d)
+bool has_busy_rmid(struct rdt_mon_domain *d)
{
u32 idx_limit = resctrl_arch_system_num_rmid_idx();
@@ -468,7 +522,7 @@ int alloc_rmid(u32 closid)
static void add_rmid_to_limbo(struct rmid_entry *entry)
{
struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
- struct rdt_domain *d;
+ struct rdt_mon_domain *d;
u32 idx;
lockdep_assert_held(&rdtgroup_mutex);
@@ -479,7 +533,7 @@ static void add_rmid_to_limbo(struct rmid_entry *entry)
idx = resctrl_arch_rmid_idx_encode(entry->closid, entry->rmid);
entry->busy = 0;
- list_for_each_entry(d, &r->domains, list) {
+ list_for_each_entry(d, &r->mon_domains, hdr.list) {
/*
* For the first limbo RMID in the domain,
* setup up the limbo worker.
@@ -508,7 +562,8 @@ void free_rmid(u32 closid, u32 rmid)
* allows architectures that ignore the closid parameter to avoid an
* unnecessary check.
*/
- if (idx == resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID,
+ if (!resctrl_arch_mon_capable() ||
+ idx == resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID,
RESCTRL_RESERVED_RMID))
return;
@@ -520,7 +575,7 @@ void free_rmid(u32 closid, u32 rmid)
list_add_tail(&entry->list, &rmid_free_lru);
}
-static struct mbm_state *get_mbm_state(struct rdt_domain *d, u32 closid,
+static struct mbm_state *get_mbm_state(struct rdt_mon_domain *d, u32 closid,
u32 rmid, enum resctrl_event_id evtid)
{
u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid);
@@ -537,7 +592,10 @@ static struct mbm_state *get_mbm_state(struct rdt_domain *d, u32 closid,
static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr)
{
+ int cpu = smp_processor_id();
+ struct rdt_mon_domain *d;
struct mbm_state *m;
+ int err, ret;
u64 tval = 0;
if (rr->first) {
@@ -548,14 +606,47 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr)
return 0;
}
- rr->err = resctrl_arch_rmid_read(rr->r, rr->d, closid, rmid, rr->evtid,
- &tval, rr->arch_mon_ctx);
- if (rr->err)
- return rr->err;
+ if (rr->d) {
+ /* Reading a single domain, must be on a CPU in that domain. */
+ if (!cpumask_test_cpu(cpu, &rr->d->hdr.cpu_mask))
+ return -EINVAL;
+ rr->err = resctrl_arch_rmid_read(rr->r, rr->d, closid, rmid,
+ rr->evtid, &tval, rr->arch_mon_ctx);
+ if (rr->err)
+ return rr->err;
- rr->val += tval;
+ rr->val += tval;
- return 0;
+ return 0;
+ }
+
+ /* Summing domains that share a cache, must be on a CPU for that cache. */
+ if (!cpumask_test_cpu(cpu, &rr->ci->shared_cpu_map))
+ return -EINVAL;
+
+ /*
+ * Legacy files must report the sum of an event across all
+ * domains that share the same L3 cache instance.
+ * Report success if a read from any domain succeeds, -EINVAL
+ * (translated to "Unavailable" for user space) if reading from
+ * all domains fail for any reason.
+ */
+ ret = -EINVAL;
+ list_for_each_entry(d, &rr->r->mon_domains, hdr.list) {
+ if (d->ci->id != rr->ci->id)
+ continue;
+ err = resctrl_arch_rmid_read(rr->r, d, closid, rmid,
+ rr->evtid, &tval, rr->arch_mon_ctx);
+ if (!err) {
+ rr->val += tval;
+ ret = 0;
+ }
+ }
+
+ if (ret)
+ rr->err = ret;
+
+ return ret;
}
/*
@@ -656,12 +747,12 @@ void mon_event_count(void *info)
* throttle MSRs already have low percentage values. To avoid
* unnecessarily restricting such rdtgroups, we also increase the bandwidth.
*/
-static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm)
+static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_mon_domain *dom_mbm)
{
u32 closid, rmid, cur_msr_val, new_msr_val;
struct mbm_state *pmbm_data, *cmbm_data;
+ struct rdt_ctrl_domain *dom_mba;
struct rdt_resource *r_mba;
- struct rdt_domain *dom_mba;
u32 cur_bw, user_bw, idx;
struct list_head *head;
struct rdtgroup *entry;
@@ -676,7 +767,7 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm)
idx = resctrl_arch_rmid_idx_encode(closid, rmid);
pmbm_data = &dom_mbm->mbm_local[idx];
- dom_mba = get_domain_from_cpu(smp_processor_id(), r_mba);
+ dom_mba = get_ctrl_domain_from_cpu(smp_processor_id(), r_mba);
if (!dom_mba) {
pr_warn_once("Failure to get domain for MBA update\n");
return;
@@ -722,12 +813,11 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm)
resctrl_arch_update_one(r_mba, dom_mba, closid, CDP_NONE, new_msr_val);
}
-static void mbm_update(struct rdt_resource *r, struct rdt_domain *d,
+static void mbm_update(struct rdt_resource *r, struct rdt_mon_domain *d,
u32 closid, u32 rmid)
{
- struct rmid_read rr;
+ struct rmid_read rr = {0};
- rr.first = false;
rr.r = r;
rr.d = d;
@@ -780,17 +870,17 @@ static void mbm_update(struct rdt_resource *r, struct rdt_domain *d,
void cqm_handle_limbo(struct work_struct *work)
{
unsigned long delay = msecs_to_jiffies(CQM_LIMBOCHECK_INTERVAL);
- struct rdt_domain *d;
+ struct rdt_mon_domain *d;
cpus_read_lock();
mutex_lock(&rdtgroup_mutex);
- d = container_of(work, struct rdt_domain, cqm_limbo.work);
+ d = container_of(work, struct rdt_mon_domain, cqm_limbo.work);
__check_limbo(d, false);
if (has_busy_rmid(d)) {
- d->cqm_work_cpu = cpumask_any_housekeeping(&d->cpu_mask,
+ d->cqm_work_cpu = cpumask_any_housekeeping(&d->hdr.cpu_mask,
RESCTRL_PICK_ANY_CPU);
schedule_delayed_work_on(d->cqm_work_cpu, &d->cqm_limbo,
delay);
@@ -808,13 +898,13 @@ void cqm_handle_limbo(struct work_struct *work)
* @exclude_cpu: Which CPU the handler should not run on,
* RESCTRL_PICK_ANY_CPU to pick any CPU.
*/
-void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms,
+void cqm_setup_limbo_handler(struct rdt_mon_domain *dom, unsigned long delay_ms,
int exclude_cpu)
{
unsigned long delay = msecs_to_jiffies(delay_ms);
int cpu;
- cpu = cpumask_any_housekeeping(&dom->cpu_mask, exclude_cpu);
+ cpu = cpumask_any_housekeeping(&dom->hdr.cpu_mask, exclude_cpu);
dom->cqm_work_cpu = cpu;
if (cpu < nr_cpu_ids)
@@ -825,9 +915,9 @@ void mbm_handle_overflow(struct work_struct *work)
{
unsigned long delay = msecs_to_jiffies(MBM_OVERFLOW_INTERVAL);
struct rdtgroup *prgrp, *crgrp;
+ struct rdt_mon_domain *d;
struct list_head *head;
struct rdt_resource *r;
- struct rdt_domain *d;
cpus_read_lock();
mutex_lock(&rdtgroup_mutex);
@@ -840,7 +930,7 @@ void mbm_handle_overflow(struct work_struct *work)
goto out_unlock;
r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
- d = container_of(work, struct rdt_domain, mbm_over.work);
+ d = container_of(work, struct rdt_mon_domain, mbm_over.work);
list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
mbm_update(r, d, prgrp->closid, prgrp->mon.rmid);
@@ -857,7 +947,7 @@ void mbm_handle_overflow(struct work_struct *work)
* Re-check for housekeeping CPUs. This allows the overflow handler to
* move off a nohz_full CPU quickly.
*/
- d->mbm_work_cpu = cpumask_any_housekeeping(&d->cpu_mask,
+ d->mbm_work_cpu = cpumask_any_housekeeping(&d->hdr.cpu_mask,
RESCTRL_PICK_ANY_CPU);
schedule_delayed_work_on(d->mbm_work_cpu, &d->mbm_over, delay);
@@ -874,7 +964,7 @@ out_unlock:
* @exclude_cpu: Which CPU the handler should not run on,
* RESCTRL_PICK_ANY_CPU to pick any CPU.
*/
-void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms,
+void mbm_setup_overflow_handler(struct rdt_mon_domain *dom, unsigned long delay_ms,
int exclude_cpu)
{
unsigned long delay = msecs_to_jiffies(delay_ms);
@@ -886,7 +976,7 @@ void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms,
*/
if (!resctrl_mounted || !resctrl_arch_mon_capable())
return;
- cpu = cpumask_any_housekeeping(&dom->cpu_mask, exclude_cpu);
+ cpu = cpumask_any_housekeeping(&dom->hdr.cpu_mask, exclude_cpu);
dom->mbm_work_cpu = cpu;
if (cpu < nr_cpu_ids)
@@ -1003,6 +1093,88 @@ static void l3_mon_evt_init(struct rdt_resource *r)
list_add_tail(&mbm_local_event.list, &r->evt_list);
}
+/*
+ * The power-on reset value of MSR_RMID_SNC_CONFIG is 0x1
+ * which indicates that RMIDs are configured in legacy mode.
+ * This mode is incompatible with Linux resctrl semantics
+ * as RMIDs are partitioned between SNC nodes, which requires
+ * a user to know which RMID is allocated to a task.
+ * Clearing bit 0 reconfigures the RMID counters for use
+ * in RMID sharing mode. This mode is better for Linux.
+ * The RMID space is divided between all SNC nodes with the
+ * RMIDs renumbered to start from zero in each node when
+ * counting operations from tasks. Code to read the counters
+ * must adjust RMID counter numbers based on SNC node. See
+ * logical_rmid_to_physical_rmid() for code that does this.
+ */
+void arch_mon_domain_online(struct rdt_resource *r, struct rdt_mon_domain *d)
+{
+ if (snc_nodes_per_l3_cache > 1)
+ msr_clear_bit(MSR_RMID_SNC_CONFIG, 0);
+}
+
+/* CPU models that support MSR_RMID_SNC_CONFIG */
+static const struct x86_cpu_id snc_cpu_ids[] __initconst = {
+ X86_MATCH_VFM(INTEL_ICELAKE_X, 0),
+ X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, 0),
+ X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, 0),
+ X86_MATCH_VFM(INTEL_GRANITERAPIDS_X, 0),
+ X86_MATCH_VFM(INTEL_ATOM_CRESTMONT_X, 0),
+ {}
+};
+
+/*
+ * There isn't a simple hardware bit that indicates whether a CPU is running
+ * in Sub-NUMA Cluster (SNC) mode. Infer the state by comparing the
+ * number of CPUs sharing the L3 cache with CPU0 to the number of CPUs in
+ * the same NUMA node as CPU0.
+ * It is not possible to accurately determine SNC state if the system is
+ * booted with a maxcpus=N parameter. That distorts the ratio of SNC nodes
+ * to L3 caches. It will be OK if system is booted with hyperthreading
+ * disabled (since this doesn't affect the ratio).
+ */
+static __init int snc_get_config(void)
+{
+ struct cacheinfo *ci = get_cpu_cacheinfo_level(0, RESCTRL_L3_CACHE);
+ const cpumask_t *node0_cpumask;
+ int cpus_per_node, cpus_per_l3;
+ int ret;
+
+ if (!x86_match_cpu(snc_cpu_ids) || !ci)
+ return 1;
+
+ cpus_read_lock();
+ if (num_online_cpus() != num_present_cpus())
+ pr_warn("Some CPUs offline, SNC detection may be incorrect\n");
+ cpus_read_unlock();
+
+ node0_cpumask = cpumask_of_node(cpu_to_node(0));
+
+ cpus_per_node = cpumask_weight(node0_cpumask);
+ cpus_per_l3 = cpumask_weight(&ci->shared_cpu_map);
+
+ if (!cpus_per_node || !cpus_per_l3)
+ return 1;
+
+ ret = cpus_per_l3 / cpus_per_node;
+
+ /* sanity check: Only valid results are 1, 2, 3, 4 */
+ switch (ret) {
+ case 1:
+ break;
+ case 2 ... 4:
+ pr_info("Sub-NUMA Cluster mode detected with %d nodes per L3 cache\n", ret);
+ rdt_resources_all[RDT_RESOURCE_L3].r_resctrl.mon_scope = RESCTRL_L3_NODE;
+ break;
+ default:
+ pr_warn("Ignore improbable SNC node count %d\n", ret);
+ ret = 1;
+ break;
+ }
+
+ return ret;
+}
+
int __init rdt_get_mon_l3_config(struct rdt_resource *r)
{
unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset;
@@ -1010,9 +1182,11 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r)
unsigned int threshold;
int ret;
+ snc_nodes_per_l3_cache = snc_get_config();
+
resctrl_rmid_realloc_limit = boot_cpu_data.x86_cache_size * 1024;
- hw_res->mon_scale = boot_cpu_data.x86_cache_occ_scale;
- r->num_rmid = boot_cpu_data.x86_cache_max_rmid + 1;
+ hw_res->mon_scale = boot_cpu_data.x86_cache_occ_scale / snc_nodes_per_l3_cache;
+ r->num_rmid = (boot_cpu_data.x86_cache_max_rmid + 1) / snc_nodes_per_l3_cache;
hw_res->mbm_width = MBM_CNTR_WIDTH_BASE;
if (mbm_offset > 0 && mbm_offset <= MBM_CNTR_WIDTH_OFFSET_MAX)
diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
index 884b88e25141..972e6b6b0481 100644
--- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
+++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
@@ -11,7 +11,6 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include <linux/cacheinfo.h>
#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/debugfs.h>
@@ -23,7 +22,7 @@
#include <linux/uaccess.h>
#include <asm/cacheflush.h>
-#include <asm/intel-family.h>
+#include <asm/cpu_device_id.h>
#include <asm/resctrl.h>
#include <asm/perf_event.h>
@@ -31,7 +30,7 @@
#include "internal.h"
#define CREATE_TRACE_POINTS
-#include "pseudo_lock_event.h"
+#include "trace.h"
/*
* The bits needed to disable hardware prefetching varies based on the
@@ -88,8 +87,8 @@ static u64 get_prefetch_disable_bits(void)
boot_cpu_data.x86 != 6)
return 0;
- switch (boot_cpu_data.x86_model) {
- case INTEL_FAM6_BROADWELL_X:
+ switch (boot_cpu_data.x86_vfm) {
+ case INTEL_BROADWELL_X:
/*
* SDM defines bits of MSR_MISC_FEATURE_CONTROL register
* as:
@@ -100,8 +99,8 @@ static u64 get_prefetch_disable_bits(void)
* 63:4 Reserved
*/
return 0xF;
- case INTEL_FAM6_ATOM_GOLDMONT:
- case INTEL_FAM6_ATOM_GOLDMONT_PLUS:
+ case INTEL_ATOM_GOLDMONT:
+ case INTEL_ATOM_GOLDMONT_PLUS:
/*
* SDM defines bits of MSR_MISC_FEATURE_CONTROL register
* as:
@@ -221,7 +220,7 @@ static int pseudo_lock_cstates_constrain(struct pseudo_lock_region *plr)
int cpu;
int ret;
- for_each_cpu(cpu, &plr->d->cpu_mask) {
+ for_each_cpu(cpu, &plr->d->hdr.cpu_mask) {
pm_req = kzalloc(sizeof(*pm_req), GFP_KERNEL);
if (!pm_req) {
rdt_last_cmd_puts("Failure to allocate memory for PM QoS\n");
@@ -292,12 +291,15 @@ static void pseudo_lock_region_clear(struct pseudo_lock_region *plr)
*/
static int pseudo_lock_region_init(struct pseudo_lock_region *plr)
{
- struct cpu_cacheinfo *ci;
+ enum resctrl_scope scope = plr->s->res->ctrl_scope;
+ struct cacheinfo *ci;
int ret;
- int i;
+
+ if (WARN_ON_ONCE(scope != RESCTRL_L2_CACHE && scope != RESCTRL_L3_CACHE))
+ return -ENODEV;
/* Pick the first cpu we find that is associated with the cache. */
- plr->cpu = cpumask_first(&plr->d->cpu_mask);
+ plr->cpu = cpumask_first(&plr->d->hdr.cpu_mask);
if (!cpu_online(plr->cpu)) {
rdt_last_cmd_printf("CPU %u associated with cache not online\n",
@@ -306,15 +308,11 @@ static int pseudo_lock_region_init(struct pseudo_lock_region *plr)
goto out_region;
}
- ci = get_cpu_cacheinfo(plr->cpu);
-
- plr->size = rdtgroup_cbm_to_size(plr->s->res, plr->d, plr->cbm);
-
- for (i = 0; i < ci->num_leaves; i++) {
- if (ci->info_list[i].level == plr->s->res->cache_level) {
- plr->line_size = ci->info_list[i].coherency_line_size;
- return 0;
- }
+ ci = get_cpu_cacheinfo_level(plr->cpu, scope);
+ if (ci) {
+ plr->line_size = ci->coherency_line_size;
+ plr->size = rdtgroup_cbm_to_size(plr->s->res, plr->d, plr->cbm);
+ return 0;
}
ret = -1;
@@ -810,7 +808,7 @@ int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp)
* Return: true if @cbm overlaps with pseudo-locked region on @d, false
* otherwise.
*/
-bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_domain *d, unsigned long cbm)
+bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_ctrl_domain *d, unsigned long cbm)
{
unsigned int cbm_len;
unsigned long cbm_b;
@@ -837,11 +835,11 @@ bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_domain *d, unsigned long cbm
* if it is not possible to test due to memory allocation issue,
* false otherwise.
*/
-bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d)
+bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d)
{
+ struct rdt_ctrl_domain *d_i;
cpumask_var_t cpu_with_psl;
struct rdt_resource *r;
- struct rdt_domain *d_i;
bool ret = false;
/* Walking r->domains, ensure it can't race with cpuhp */
@@ -855,10 +853,10 @@ bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d)
* associated with them.
*/
for_each_alloc_capable_rdt_resource(r) {
- list_for_each_entry(d_i, &r->domains, list) {
+ list_for_each_entry(d_i, &r->ctrl_domains, hdr.list) {
if (d_i->plr)
cpumask_or(cpu_with_psl, cpu_with_psl,
- &d_i->cpu_mask);
+ &d_i->hdr.cpu_mask);
}
}
@@ -866,7 +864,7 @@ bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d)
* Next test if new pseudo-locked region would intersect with
* existing region.
*/
- if (cpumask_intersects(&d->cpu_mask, cpu_with_psl))
+ if (cpumask_intersects(&d->hdr.cpu_mask, cpu_with_psl))
ret = true;
free_cpumask_var(cpu_with_psl);
@@ -1084,9 +1082,9 @@ static int measure_l2_residency(void *_plr)
* L2_HIT 02H
* L2_MISS 10H
*/
- switch (boot_cpu_data.x86_model) {
- case INTEL_FAM6_ATOM_GOLDMONT:
- case INTEL_FAM6_ATOM_GOLDMONT_PLUS:
+ switch (boot_cpu_data.x86_vfm) {
+ case INTEL_ATOM_GOLDMONT:
+ case INTEL_ATOM_GOLDMONT_PLUS:
perf_miss_attr.config = X86_CONFIG(.event = 0xd1,
.umask = 0x10);
perf_hit_attr.config = X86_CONFIG(.event = 0xd1,
@@ -1123,8 +1121,8 @@ static int measure_l3_residency(void *_plr)
* MISS 41H
*/
- switch (boot_cpu_data.x86_model) {
- case INTEL_FAM6_BROADWELL_X:
+ switch (boot_cpu_data.x86_vfm) {
+ case INTEL_BROADWELL_X:
/* On BDW the hit event counts references, not hits */
perf_hit_attr.config = X86_CONFIG(.event = 0x2e,
.umask = 0x4f);
@@ -1142,7 +1140,7 @@ static int measure_l3_residency(void *_plr)
*/
counts.miss_after -= counts.miss_before;
- if (boot_cpu_data.x86_model == INTEL_FAM6_BROADWELL_X) {
+ if (boot_cpu_data.x86_vfm == INTEL_BROADWELL_X) {
/*
* On BDW references and misses are counted, need to adjust.
* Sometimes the "hits" counter is a bit more than the
@@ -1198,7 +1196,7 @@ static int pseudo_lock_measure_cycles(struct rdtgroup *rdtgrp, int sel)
}
plr->thread_done = 0;
- cpu = cpumask_first(&plr->d->cpu_mask);
+ cpu = cpumask_first(&plr->d->hdr.cpu_mask);
if (!cpu_online(cpu)) {
ret = -ENODEV;
goto out;
@@ -1528,7 +1526,7 @@ static int pseudo_lock_dev_mmap(struct file *filp, struct vm_area_struct *vma)
* may be scheduled elsewhere and invalidate entries in the
* pseudo-locked region.
*/
- if (!cpumask_subset(current->cpus_ptr, &plr->d->cpu_mask)) {
+ if (!cpumask_subset(current->cpus_ptr, &plr->d->hdr.cpu_mask)) {
mutex_unlock(&rdtgroup_mutex);
return -EINVAL;
}
@@ -1569,7 +1567,6 @@ static int pseudo_lock_dev_mmap(struct file *filp, struct vm_area_struct *vma)
static const struct file_operations pseudo_lock_dev_fops = {
.owner = THIS_MODULE,
- .llseek = no_llseek,
.read = NULL,
.write = NULL,
.open = pseudo_lock_dev_open,
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index 011e17efb1a6..d7163b764c62 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -12,7 +12,6 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include <linux/cacheinfo.h>
#include <linux/cpu.h>
#include <linux/debugfs.h>
#include <linux/fs.h>
@@ -92,13 +91,13 @@ void rdt_last_cmd_printf(const char *fmt, ...)
void rdt_staged_configs_clear(void)
{
+ struct rdt_ctrl_domain *dom;
struct rdt_resource *r;
- struct rdt_domain *dom;
lockdep_assert_held(&rdtgroup_mutex);
for_each_alloc_capable_rdt_resource(r) {
- list_for_each_entry(dom, &r->domains, list)
+ list_for_each_entry(dom, &r->ctrl_domains, hdr.list)
memset(dom->staged_config, 0, sizeof(dom->staged_config));
}
}
@@ -317,7 +316,7 @@ static int rdtgroup_cpus_show(struct kernfs_open_file *of,
rdt_last_cmd_puts("Cache domain offline\n");
ret = -ENODEV;
} else {
- mask = &rdtgrp->plr->d->cpu_mask;
+ mask = &rdtgrp->plr->d->hdr.cpu_mask;
seq_printf(s, is_cpu_list(of) ?
"%*pbl\n" : "%*pb\n",
cpumask_pr_args(mask));
@@ -1012,7 +1011,7 @@ static int rdt_bit_usage_show(struct kernfs_open_file *of,
unsigned long sw_shareable = 0, hw_shareable = 0;
unsigned long exclusive = 0, pseudo_locked = 0;
struct rdt_resource *r = s->res;
- struct rdt_domain *dom;
+ struct rdt_ctrl_domain *dom;
int i, hwb, swb, excl, psl;
enum rdtgrp_mode mode;
bool sep = false;
@@ -1021,12 +1020,12 @@ static int rdt_bit_usage_show(struct kernfs_open_file *of,
cpus_read_lock();
mutex_lock(&rdtgroup_mutex);
hw_shareable = r->cache.shareable_bits;
- list_for_each_entry(dom, &r->domains, list) {
+ list_for_each_entry(dom, &r->ctrl_domains, hdr.list) {
if (sep)
seq_putc(seq, ';');
sw_shareable = 0;
exclusive = 0;
- seq_printf(seq, "%d=", dom->id);
+ seq_printf(seq, "%d=", dom->hdr.id);
for (i = 0; i < closids_supported(); i++) {
if (!closid_allocated(i))
continue;
@@ -1243,7 +1242,7 @@ static int rdt_has_sparse_bitmasks_show(struct kernfs_open_file *of,
*
* Return: false if CBM does not overlap, true if it does.
*/
-static bool __rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d,
+static bool __rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_ctrl_domain *d,
unsigned long cbm, int closid,
enum resctrl_conf_type type, bool exclusive)
{
@@ -1298,7 +1297,7 @@ static bool __rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d
*
* Return: true if CBM overlap detected, false if there is no overlap
*/
-bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_domain *d,
+bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_ctrl_domain *d,
unsigned long cbm, int closid, bool exclusive)
{
enum resctrl_conf_type peer_type = resctrl_peer_type(s->conf_type);
@@ -1329,10 +1328,10 @@ bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_domain *d,
static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp)
{
int closid = rdtgrp->closid;
+ struct rdt_ctrl_domain *d;
struct resctrl_schema *s;
struct rdt_resource *r;
bool has_cache = false;
- struct rdt_domain *d;
u32 ctrl;
/* Walking r->domains, ensure it can't race with cpuhp */
@@ -1343,7 +1342,7 @@ static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp)
if (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA)
continue;
has_cache = true;
- list_for_each_entry(d, &r->domains, list) {
+ list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
ctrl = resctrl_arch_get_config(r, d, closid,
s->conf_type);
if (rdtgroup_cbm_overlaps(s, d, ctrl, closid, false)) {
@@ -1448,20 +1447,19 @@ out:
* bitmap functions work correctly.
*/
unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r,
- struct rdt_domain *d, unsigned long cbm)
+ struct rdt_ctrl_domain *d, unsigned long cbm)
{
- struct cpu_cacheinfo *ci;
unsigned int size = 0;
- int num_b, i;
+ struct cacheinfo *ci;
+ int num_b;
+
+ if (WARN_ON_ONCE(r->ctrl_scope != RESCTRL_L2_CACHE && r->ctrl_scope != RESCTRL_L3_CACHE))
+ return size;
num_b = bitmap_weight(&cbm, r->cache.cbm_len);
- ci = get_cpu_cacheinfo(cpumask_any(&d->cpu_mask));
- for (i = 0; i < ci->num_leaves; i++) {
- if (ci->info_list[i].level == r->cache_level) {
- size = ci->info_list[i].size / r->cache.cbm_len * num_b;
- break;
- }
- }
+ ci = get_cpu_cacheinfo_level(cpumask_any(&d->hdr.cpu_mask), r->ctrl_scope);
+ if (ci)
+ size = ci->size / r->cache.cbm_len * num_b;
return size;
}
@@ -1477,9 +1475,9 @@ static int rdtgroup_size_show(struct kernfs_open_file *of,
{
struct resctrl_schema *schema;
enum resctrl_conf_type type;
+ struct rdt_ctrl_domain *d;
struct rdtgroup *rdtgrp;
struct rdt_resource *r;
- struct rdt_domain *d;
unsigned int size;
int ret = 0;
u32 closid;
@@ -1503,7 +1501,7 @@ static int rdtgroup_size_show(struct kernfs_open_file *of,
size = rdtgroup_cbm_to_size(rdtgrp->plr->s->res,
rdtgrp->plr->d,
rdtgrp->plr->cbm);
- seq_printf(s, "%d=%u\n", rdtgrp->plr->d->id, size);
+ seq_printf(s, "%d=%u\n", rdtgrp->plr->d->hdr.id, size);
}
goto out;
}
@@ -1515,7 +1513,7 @@ static int rdtgroup_size_show(struct kernfs_open_file *of,
type = schema->conf_type;
sep = false;
seq_printf(s, "%*s:", max_name_width, schema->name);
- list_for_each_entry(d, &r->domains, list) {
+ list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
if (sep)
seq_putc(s, ';');
if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
@@ -1533,7 +1531,7 @@ static int rdtgroup_size_show(struct kernfs_open_file *of,
else
size = rdtgroup_cbm_to_size(r, d, ctrl);
}
- seq_printf(s, "%d=%u", d->id, size);
+ seq_printf(s, "%d=%u", d->hdr.id, size);
sep = true;
}
seq_putc(s, '\n');
@@ -1591,21 +1589,21 @@ static void mon_event_config_read(void *info)
mon_info->mon_config = msrval & MAX_EVT_CONFIG_BITS;
}
-static void mondata_config_read(struct rdt_domain *d, struct mon_config_info *mon_info)
+static void mondata_config_read(struct rdt_mon_domain *d, struct mon_config_info *mon_info)
{
- smp_call_function_any(&d->cpu_mask, mon_event_config_read, mon_info, 1);
+ smp_call_function_any(&d->hdr.cpu_mask, mon_event_config_read, mon_info, 1);
}
static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid)
{
struct mon_config_info mon_info = {0};
- struct rdt_domain *dom;
+ struct rdt_mon_domain *dom;
bool sep = false;
cpus_read_lock();
mutex_lock(&rdtgroup_mutex);
- list_for_each_entry(dom, &r->domains, list) {
+ list_for_each_entry(dom, &r->mon_domains, hdr.list) {
if (sep)
seq_puts(s, ";");
@@ -1613,7 +1611,7 @@ static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid
mon_info.evtid = evtid;
mondata_config_read(dom, &mon_info);
- seq_printf(s, "%d=0x%02x", dom->id, mon_info.mon_config);
+ seq_printf(s, "%d=0x%02x", dom->hdr.id, mon_info.mon_config);
sep = true;
}
seq_puts(s, "\n");
@@ -1658,7 +1656,7 @@ static void mon_event_config_write(void *info)
}
static void mbm_config_write_domain(struct rdt_resource *r,
- struct rdt_domain *d, u32 evtid, u32 val)
+ struct rdt_mon_domain *d, u32 evtid, u32 val)
{
struct mon_config_info mon_info = {0};
@@ -1679,7 +1677,7 @@ static void mbm_config_write_domain(struct rdt_resource *r,
* are scoped at the domain level. Writing any of these MSRs
* on one CPU is observed by all the CPUs in the domain.
*/
- smp_call_function_any(&d->cpu_mask, mon_event_config_write,
+ smp_call_function_any(&d->hdr.cpu_mask, mon_event_config_write,
&mon_info, 1);
/*
@@ -1699,7 +1697,7 @@ static int mon_config_write(struct rdt_resource *r, char *tok, u32 evtid)
struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
char *dom_str = NULL, *id_str;
unsigned long dom_id, val;
- struct rdt_domain *d;
+ struct rdt_mon_domain *d;
/* Walking r->domains, ensure it can't race with cpuhp */
lockdep_assert_cpus_held();
@@ -1729,8 +1727,8 @@ next:
return -EINVAL;
}
- list_for_each_entry(d, &r->domains, list) {
- if (d->id == dom_id) {
+ list_for_each_entry(d, &r->mon_domains, hdr.list) {
+ if (d->hdr.id == dom_id) {
mbm_config_write_domain(r, d, evtid, val);
goto next;
}
@@ -2258,9 +2256,9 @@ static inline bool is_mba_linear(void)
static int set_cache_qos_cfg(int level, bool enable)
{
void (*update)(void *arg);
+ struct rdt_ctrl_domain *d;
struct rdt_resource *r_l;
cpumask_var_t cpu_mask;
- struct rdt_domain *d;
int cpu;
/* Walking r->domains, ensure it can't race with cpuhp */
@@ -2277,14 +2275,14 @@ static int set_cache_qos_cfg(int level, bool enable)
return -ENOMEM;
r_l = &rdt_resources_all[level].r_resctrl;
- list_for_each_entry(d, &r_l->domains, list) {
+ list_for_each_entry(d, &r_l->ctrl_domains, hdr.list) {
if (r_l->cache.arch_has_per_cpu_cfg)
/* Pick all the CPUs in the domain instance */
- for_each_cpu(cpu, &d->cpu_mask)
+ for_each_cpu(cpu, &d->hdr.cpu_mask)
cpumask_set_cpu(cpu, cpu_mask);
else
/* Pick one CPU from each domain instance to update MSR */
- cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask);
+ cpumask_set_cpu(cpumask_any(&d->hdr.cpu_mask), cpu_mask);
}
/* Update QOS_CFG MSR on all the CPUs in cpu_mask */
@@ -2310,10 +2308,10 @@ void rdt_domain_reconfigure_cdp(struct rdt_resource *r)
l3_qos_cfg_update(&hw_res->cdp_enabled);
}
-static int mba_sc_domain_allocate(struct rdt_resource *r, struct rdt_domain *d)
+static int mba_sc_domain_allocate(struct rdt_resource *r, struct rdt_ctrl_domain *d)
{
u32 num_closid = resctrl_arch_get_num_closid(r);
- int cpu = cpumask_any(&d->cpu_mask);
+ int cpu = cpumask_any(&d->hdr.cpu_mask);
int i;
d->mbps_val = kcalloc_node(num_closid, sizeof(*d->mbps_val),
@@ -2328,7 +2326,7 @@ static int mba_sc_domain_allocate(struct rdt_resource *r, struct rdt_domain *d)
}
static void mba_sc_domain_destroy(struct rdt_resource *r,
- struct rdt_domain *d)
+ struct rdt_ctrl_domain *d)
{
kfree(d->mbps_val);
d->mbps_val = NULL;
@@ -2336,14 +2334,18 @@ static void mba_sc_domain_destroy(struct rdt_resource *r,
/*
* MBA software controller is supported only if
- * MBM is supported and MBA is in linear scale.
+ * MBM is supported and MBA is in linear scale,
+ * and the MBM monitor scope is the same as MBA
+ * control scope.
*/
static bool supports_mba_mbps(void)
{
+ struct rdt_resource *rmbm = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl;
return (is_mbm_local_enabled() &&
- r->alloc_capable && is_mba_linear());
+ r->alloc_capable && is_mba_linear() &&
+ r->ctrl_scope == rmbm->mon_scope);
}
/*
@@ -2354,7 +2356,7 @@ static int set_mba_sc(bool mba_sc)
{
struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl;
u32 num_closid = resctrl_arch_get_num_closid(r);
- struct rdt_domain *d;
+ struct rdt_ctrl_domain *d;
int i;
if (!supports_mba_mbps() || mba_sc == is_mba_sc(r))
@@ -2362,7 +2364,7 @@ static int set_mba_sc(bool mba_sc)
r->membw.mba_sc = mba_sc;
- list_for_each_entry(d, &r->domains, list) {
+ list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
for (i = 0; i < num_closid; i++)
d->mbps_val[i] = MBA_MAX_MBPS;
}
@@ -2626,7 +2628,7 @@ static int rdt_get_tree(struct fs_context *fc)
{
struct rdt_fs_context *ctx = rdt_fc2context(fc);
unsigned long flags = RFTYPE_CTRL_BASE;
- struct rdt_domain *dom;
+ struct rdt_mon_domain *dom;
struct rdt_resource *r;
int ret;
@@ -2701,7 +2703,7 @@ static int rdt_get_tree(struct fs_context *fc)
if (is_mbm_enabled()) {
r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
- list_for_each_entry(dom, &r->domains, list)
+ list_for_each_entry(dom, &r->mon_domains, hdr.list)
mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL,
RESCTRL_PICK_ANY_CPU);
}
@@ -2751,6 +2753,7 @@ static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
struct rdt_fs_context *ctx = rdt_fc2context(fc);
struct fs_parse_result result;
+ const char *msg;
int opt;
opt = fs_parse(fc, rdt_fs_parameters, param, &result);
@@ -2765,8 +2768,9 @@ static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param)
ctx->enable_cdpl2 = true;
return 0;
case Opt_mba_mbps:
+ msg = "mba_MBps requires local MBM and linear scale MBA at L3 scope";
if (!supports_mba_mbps())
- return -EINVAL;
+ return invalfc(fc, msg);
ctx->enable_mba_mbps = true;
return 0;
case Opt_debug:
@@ -2811,40 +2815,32 @@ static int rdt_init_fs_context(struct fs_context *fc)
static int reset_all_ctrls(struct rdt_resource *r)
{
struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
- struct rdt_hw_domain *hw_dom;
+ struct rdt_hw_ctrl_domain *hw_dom;
struct msr_param msr_param;
- cpumask_var_t cpu_mask;
- struct rdt_domain *d;
+ struct rdt_ctrl_domain *d;
int i;
/* Walking r->domains, ensure it can't race with cpuhp */
lockdep_assert_cpus_held();
- if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
- return -ENOMEM;
-
msr_param.res = r;
msr_param.low = 0;
msr_param.high = hw_res->num_closid;
/*
* Disable resource control for this resource by setting all
- * CBMs in all domains to the maximum mask value. Pick one CPU
+ * CBMs in all ctrl_domains to the maximum mask value. Pick one CPU
* from each domain to update the MSRs below.
*/
- list_for_each_entry(d, &r->domains, list) {
- hw_dom = resctrl_to_arch_dom(d);
- cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask);
+ list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
+ hw_dom = resctrl_to_arch_ctrl_dom(d);
for (i = 0; i < hw_res->num_closid; i++)
hw_dom->ctrl_val[i] = r->default_ctrl;
+ msr_param.dom = d;
+ smp_call_function_any(&d->hdr.cpu_mask, rdt_ctrl_update, &msr_param, 1);
}
- /* Update CBM on all the CPUs in cpu_mask */
- on_each_cpu_mask(cpu_mask, rdt_ctrl_update, &msr_param, 1);
-
- free_cpumask_var(cpu_mask);
-
return 0;
}
@@ -3010,62 +3006,126 @@ static int mon_addfile(struct kernfs_node *parent_kn, const char *name,
return ret;
}
+static void mon_rmdir_one_subdir(struct kernfs_node *pkn, char *name, char *subname)
+{
+ struct kernfs_node *kn;
+
+ kn = kernfs_find_and_get(pkn, name);
+ if (!kn)
+ return;
+ kernfs_put(kn);
+
+ if (kn->dir.subdirs <= 1)
+ kernfs_remove(kn);
+ else
+ kernfs_remove_by_name(kn, subname);
+}
+
/*
* Remove all subdirectories of mon_data of ctrl_mon groups
- * and monitor groups with given domain id.
+ * and monitor groups for the given domain.
+ * Remove files and directories containing "sum" of domain data
+ * when last domain being summed is removed.
*/
static void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
- unsigned int dom_id)
+ struct rdt_mon_domain *d)
{
struct rdtgroup *prgrp, *crgrp;
+ char subname[32];
+ bool snc_mode;
char name[32];
+ snc_mode = r->mon_scope == RESCTRL_L3_NODE;
+ sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci->id : d->hdr.id);
+ if (snc_mode)
+ sprintf(subname, "mon_sub_%s_%02d", r->name, d->hdr.id);
+
list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
- sprintf(name, "mon_%s_%02d", r->name, dom_id);
- kernfs_remove_by_name(prgrp->mon.mon_data_kn, name);
+ mon_rmdir_one_subdir(prgrp->mon.mon_data_kn, name, subname);
list_for_each_entry(crgrp, &prgrp->mon.crdtgrp_list, mon.crdtgrp_list)
- kernfs_remove_by_name(crgrp->mon.mon_data_kn, name);
+ mon_rmdir_one_subdir(crgrp->mon.mon_data_kn, name, subname);
}
}
-static int mkdir_mondata_subdir(struct kernfs_node *parent_kn,
- struct rdt_domain *d,
- struct rdt_resource *r, struct rdtgroup *prgrp)
+static int mon_add_all_files(struct kernfs_node *kn, struct rdt_mon_domain *d,
+ struct rdt_resource *r, struct rdtgroup *prgrp,
+ bool do_sum)
{
+ struct rmid_read rr = {0};
union mon_data_bits priv;
- struct kernfs_node *kn;
struct mon_evt *mevt;
- struct rmid_read rr;
- char name[32];
int ret;
- sprintf(name, "mon_%s_%02d", r->name, d->id);
- /* create the directory */
- kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp);
- if (IS_ERR(kn))
- return PTR_ERR(kn);
-
- ret = rdtgroup_kn_set_ugid(kn);
- if (ret)
- goto out_destroy;
-
- if (WARN_ON(list_empty(&r->evt_list))) {
- ret = -EPERM;
- goto out_destroy;
- }
+ if (WARN_ON(list_empty(&r->evt_list)))
+ return -EPERM;
priv.u.rid = r->rid;
- priv.u.domid = d->id;
+ priv.u.domid = do_sum ? d->ci->id : d->hdr.id;
+ priv.u.sum = do_sum;
list_for_each_entry(mevt, &r->evt_list, list) {
priv.u.evtid = mevt->evtid;
ret = mon_addfile(kn, mevt->name, priv.priv);
if (ret)
+ return ret;
+
+ if (!do_sum && is_mbm_event(mevt->evtid))
+ mon_event_read(&rr, r, d, prgrp, &d->hdr.cpu_mask, mevt->evtid, true);
+ }
+
+ return 0;
+}
+
+static int mkdir_mondata_subdir(struct kernfs_node *parent_kn,
+ struct rdt_mon_domain *d,
+ struct rdt_resource *r, struct rdtgroup *prgrp)
+{
+ struct kernfs_node *kn, *ckn;
+ char name[32];
+ bool snc_mode;
+ int ret = 0;
+
+ lockdep_assert_held(&rdtgroup_mutex);
+
+ snc_mode = r->mon_scope == RESCTRL_L3_NODE;
+ sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci->id : d->hdr.id);
+ kn = kernfs_find_and_get(parent_kn, name);
+ if (kn) {
+ /*
+ * rdtgroup_mutex will prevent this directory from being
+ * removed. No need to keep this hold.
+ */
+ kernfs_put(kn);
+ } else {
+ kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp);
+ if (IS_ERR(kn))
+ return PTR_ERR(kn);
+
+ ret = rdtgroup_kn_set_ugid(kn);
+ if (ret)
goto out_destroy;
+ ret = mon_add_all_files(kn, d, r, prgrp, snc_mode);
+ if (ret)
+ goto out_destroy;
+ }
- if (is_mbm_event(mevt->evtid))
- mon_event_read(&rr, r, d, prgrp, mevt->evtid, true);
+ if (snc_mode) {
+ sprintf(name, "mon_sub_%s_%02d", r->name, d->hdr.id);
+ ckn = kernfs_create_dir(kn, name, parent_kn->mode, prgrp);
+ if (IS_ERR(ckn)) {
+ ret = -EINVAL;
+ goto out_destroy;
+ }
+
+ ret = rdtgroup_kn_set_ugid(ckn);
+ if (ret)
+ goto out_destroy;
+
+ ret = mon_add_all_files(ckn, d, r, prgrp, false);
+ if (ret)
+ goto out_destroy;
}
+
kernfs_activate(kn);
return 0;
@@ -3079,7 +3139,7 @@ out_destroy:
* and "monitor" groups with given domain id.
*/
static void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
- struct rdt_domain *d)
+ struct rdt_mon_domain *d)
{
struct kernfs_node *parent_kn;
struct rdtgroup *prgrp, *crgrp;
@@ -3101,13 +3161,13 @@ static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn,
struct rdt_resource *r,
struct rdtgroup *prgrp)
{
- struct rdt_domain *dom;
+ struct rdt_mon_domain *dom;
int ret;
/* Walking r->domains, ensure it can't race with cpuhp */
lockdep_assert_cpus_held();
- list_for_each_entry(dom, &r->domains, list) {
+ list_for_each_entry(dom, &r->mon_domains, hdr.list) {
ret = mkdir_mondata_subdir(parent_kn, dom, r, prgrp);
if (ret)
return ret;
@@ -3206,7 +3266,7 @@ static u32 cbm_ensure_valid(u32 _val, struct rdt_resource *r)
* Set the RDT domain up to start off with all usable allocations. That is,
* all shareable and unused bits. All-zero CBM is invalid.
*/
-static int __init_one_rdt_domain(struct rdt_domain *d, struct resctrl_schema *s,
+static int __init_one_rdt_domain(struct rdt_ctrl_domain *d, struct resctrl_schema *s,
u32 closid)
{
enum resctrl_conf_type peer_type = resctrl_peer_type(s->conf_type);
@@ -3266,7 +3326,7 @@ static int __init_one_rdt_domain(struct rdt_domain *d, struct resctrl_schema *s,
*/
tmp_cbm = cfg->new_ctrl;
if (bitmap_weight(&tmp_cbm, r->cache.cbm_len) < r->cache.min_cbm_bits) {
- rdt_last_cmd_printf("No space on %s:%d\n", s->name, d->id);
+ rdt_last_cmd_printf("No space on %s:%d\n", s->name, d->hdr.id);
return -ENOSPC;
}
cfg->have_new_ctrl = true;
@@ -3286,10 +3346,10 @@ static int __init_one_rdt_domain(struct rdt_domain *d, struct resctrl_schema *s,
*/
static int rdtgroup_init_cat(struct resctrl_schema *s, u32 closid)
{
- struct rdt_domain *d;
+ struct rdt_ctrl_domain *d;
int ret;
- list_for_each_entry(d, &s->res->domains, list) {
+ list_for_each_entry(d, &s->res->ctrl_domains, hdr.list) {
ret = __init_one_rdt_domain(d, s, closid);
if (ret < 0)
return ret;
@@ -3302,9 +3362,9 @@ static int rdtgroup_init_cat(struct resctrl_schema *s, u32 closid)
static void rdtgroup_init_mba(struct rdt_resource *r, u32 closid)
{
struct resctrl_staged_config *cfg;
- struct rdt_domain *d;
+ struct rdt_ctrl_domain *d;
- list_for_each_entry(d, &r->domains, list) {
+ list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
if (is_mba_sc(r)) {
d->mbps_val[closid] = MBA_MAX_MBPS;
continue;
@@ -3928,29 +3988,33 @@ static void __init rdtgroup_setup_default(void)
mutex_unlock(&rdtgroup_mutex);
}
-static void domain_destroy_mon_state(struct rdt_domain *d)
+static void domain_destroy_mon_state(struct rdt_mon_domain *d)
{
bitmap_free(d->rmid_busy_llc);
kfree(d->mbm_total);
kfree(d->mbm_local);
}
-void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d)
+void resctrl_offline_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d)
{
mutex_lock(&rdtgroup_mutex);
if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA)
mba_sc_domain_destroy(r, d);
- if (!r->mon_capable)
- goto out_unlock;
+ mutex_unlock(&rdtgroup_mutex);
+}
+
+void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d)
+{
+ mutex_lock(&rdtgroup_mutex);
/*
* If resctrl is mounted, remove all the
* per domain monitor data directories.
*/
if (resctrl_mounted && resctrl_arch_mon_capable())
- rmdir_mondata_subdir_allrdtgrp(r, d->id);
+ rmdir_mondata_subdir_allrdtgrp(r, d);
if (is_mbm_enabled())
cancel_delayed_work(&d->mbm_over);
@@ -3969,11 +4033,10 @@ void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d)
domain_destroy_mon_state(d);
-out_unlock:
mutex_unlock(&rdtgroup_mutex);
}
-static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d)
+static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_mon_domain *d)
{
u32 idx_limit = resctrl_arch_system_num_rmid_idx();
size_t tsize;
@@ -4004,7 +4067,7 @@ static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d)
return 0;
}
-int resctrl_online_domain(struct rdt_resource *r, struct rdt_domain *d)
+int resctrl_online_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d)
{
int err = 0;
@@ -4013,11 +4076,18 @@ int resctrl_online_domain(struct rdt_resource *r, struct rdt_domain *d)
if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA) {
/* RDT_RESOURCE_MBA is never mon_capable */
err = mba_sc_domain_allocate(r, d);
- goto out_unlock;
}
- if (!r->mon_capable)
- goto out_unlock;
+ mutex_unlock(&rdtgroup_mutex);
+
+ return err;
+}
+
+int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d)
+{
+ int err;
+
+ mutex_lock(&rdtgroup_mutex);
err = domain_setup_mon_state(r, d);
if (err)
@@ -4068,8 +4138,8 @@ static void clear_childcpus(struct rdtgroup *r, unsigned int cpu)
void resctrl_offline_cpu(unsigned int cpu)
{
struct rdt_resource *l3 = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
+ struct rdt_mon_domain *d;
struct rdtgroup *rdtgrp;
- struct rdt_domain *d;
mutex_lock(&rdtgroup_mutex);
list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) {
@@ -4082,7 +4152,7 @@ void resctrl_offline_cpu(unsigned int cpu)
if (!l3->mon_capable)
goto out_unlock;
- d = get_domain_from_cpu(cpu, l3);
+ d = get_mon_domain_from_cpu(cpu, l3);
if (d) {
if (is_mbm_enabled() && cpu == d->mbm_work_cpu) {
cancel_delayed_work(&d->mbm_over);
diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock_event.h b/arch/x86/kernel/cpu/resctrl/trace.h
index 428ebbd4270b..2a506316b303 100644
--- a/arch/x86/kernel/cpu/resctrl/pseudo_lock_event.h
+++ b/arch/x86/kernel/cpu/resctrl/trace.h
@@ -2,8 +2,8 @@
#undef TRACE_SYSTEM
#define TRACE_SYSTEM resctrl
-#if !defined(_TRACE_PSEUDO_LOCK_H) || defined(TRACE_HEADER_MULTI_READ)
-#define _TRACE_PSEUDO_LOCK_H
+#if !defined(_TRACE_RESCTRL_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_RESCTRL_H
#include <linux/tracepoint.h>
@@ -35,9 +35,25 @@ TRACE_EVENT(pseudo_lock_l3,
TP_printk("hits=%llu miss=%llu",
__entry->l3_hits, __entry->l3_miss));
-#endif /* _TRACE_PSEUDO_LOCK_H */
+TRACE_EVENT(mon_llc_occupancy_limbo,
+ TP_PROTO(u32 ctrl_hw_id, u32 mon_hw_id, int domain_id, u64 llc_occupancy_bytes),
+ TP_ARGS(ctrl_hw_id, mon_hw_id, domain_id, llc_occupancy_bytes),
+ TP_STRUCT__entry(__field(u32, ctrl_hw_id)
+ __field(u32, mon_hw_id)
+ __field(int, domain_id)
+ __field(u64, llc_occupancy_bytes)),
+ TP_fast_assign(__entry->ctrl_hw_id = ctrl_hw_id;
+ __entry->mon_hw_id = mon_hw_id;
+ __entry->domain_id = domain_id;
+ __entry->llc_occupancy_bytes = llc_occupancy_bytes;),
+ TP_printk("ctrl_hw_id=%u mon_hw_id=%u domain_id=%d llc_occupancy_bytes=%llu",
+ __entry->ctrl_hw_id, __entry->mon_hw_id, __entry->domain_id,
+ __entry->llc_occupancy_bytes)
+ );
+
+#endif /* _TRACE_RESCTRL_H */
#undef TRACE_INCLUDE_PATH
#define TRACE_INCLUDE_PATH .
-#define TRACE_INCLUDE_FILE pseudo_lock_event
+#define TRACE_INCLUDE_FILE trace
#include <trace/define_trace.h>
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index af5aa2c754c2..c84c30188fdf 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -45,6 +45,7 @@ static const struct cpuid_bit cpuid_bits[] = {
{ X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 },
{ X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 },
{ X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 },
+ { X86_FEATURE_FAST_CPPC, CPUID_EDX, 15, 0x80000007, 0 },
{ X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 },
{ X86_FEATURE_SMBA, CPUID_EBX, 2, 0x80000020, 0 },
{ X86_FEATURE_BMEC, CPUID_EBX, 3, 0x80000020, 0 },
diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c
index 262f5fb18d74..22b65a5f5ec6 100644
--- a/arch/x86/kernel/cpu/sgx/driver.c
+++ b/arch/x86/kernel/cpu/sgx/driver.c
@@ -113,7 +113,7 @@ static unsigned long sgx_get_unmapped_area(struct file *file,
if (flags & MAP_FIXED)
return addr;
- return current->mm->get_unmapped_area(file, addr, len, pgoff, flags);
+ return mm_get_unmapped_area(current->mm, file, addr, len, pgoff, flags);
}
#ifdef CONFIG_COMPAT
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index 166692f2d501..9ace84486499 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -13,6 +13,7 @@
#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/sysfs.h>
+#include <linux/vmalloc.h>
#include <asm/sgx.h>
#include "driver.h"
#include "encl.h"
@@ -474,24 +475,25 @@ struct sgx_epc_page *__sgx_alloc_epc_page(void)
{
struct sgx_epc_page *page;
int nid_of_current = numa_node_id();
- int nid = nid_of_current;
+ int nid_start, nid;
- if (node_isset(nid_of_current, sgx_numa_mask)) {
- page = __sgx_alloc_epc_page_from_node(nid_of_current);
- if (page)
- return page;
- }
-
- /* Fall back to the non-local NUMA nodes: */
- while (true) {
- nid = next_node_in(nid, sgx_numa_mask);
- if (nid == nid_of_current)
- break;
+ /*
+ * Try local node first. If it doesn't have an EPC section,
+ * fall back to the non-local NUMA nodes.
+ */
+ if (node_isset(nid_of_current, sgx_numa_mask))
+ nid_start = nid_of_current;
+ else
+ nid_start = next_node_in(nid_of_current, sgx_numa_mask);
+ nid = nid_start;
+ do {
page = __sgx_alloc_epc_page_from_node(nid);
if (page)
return page;
- }
+
+ nid = next_node_in(nid, sgx_numa_mask);
+ } while (nid != nid_start);
return ERR_PTR(-ENOMEM);
}
@@ -731,7 +733,7 @@ out:
return 0;
}
-/**
+/*
* A section metric is concatenated in a way that @low bits 12-31 define the
* bits 12-31 of the metric and @high bits 0-19 define the bits 32-51 of the
* metric.
@@ -846,6 +848,13 @@ static bool __init sgx_page_cache_init(void)
return false;
}
+ for_each_online_node(nid) {
+ if (!node_isset(nid, sgx_numa_mask) &&
+ node_state(nid, N_MEMORY) && node_state(nid, N_CPU))
+ pr_info("node%d has both CPUs and memory but doesn't have an EPC section\n",
+ nid);
+ }
+
return true;
}
@@ -894,10 +903,10 @@ int sgx_set_attribute(unsigned long *allowed_attributes,
{
struct fd f = fdget(attribute_fd);
- if (!f.file)
+ if (!fd_file(f))
return -EINVAL;
- if (f.file->f_op != &sgx_provision_fops) {
+ if (fd_file(f)->f_op != &sgx_provision_fops) {
fdput(f);
return -EINVAL;
}
diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c
index d17c9b71eb4a..621a151ccf7d 100644
--- a/arch/x86/kernel/cpu/topology.c
+++ b/arch/x86/kernel/cpu/topology.c
@@ -128,6 +128,9 @@ static void topo_set_cpuids(unsigned int cpu, u32 apic_id, u32 acpi_id)
static __init bool check_for_real_bsp(u32 apic_id)
{
+ bool is_bsp = false, has_apic_base = boot_cpu_data.x86 >= 6;
+ u64 msr;
+
/*
* There is no real good way to detect whether this a kdump()
* kernel, but except on the Voyager SMP monstrosity which is not
@@ -144,17 +147,61 @@ static __init bool check_for_real_bsp(u32 apic_id)
if (topo_info.real_bsp_apic_id != BAD_APICID)
return false;
+ /*
+ * Check whether the enumeration order is broken by evaluating the
+ * BSP bit in the APICBASE MSR. If the CPU does not have the
+ * APICBASE MSR then the BSP detection is not possible and the
+ * kernel must rely on the firmware enumeration order.
+ */
+ if (has_apic_base) {
+ rdmsrl(MSR_IA32_APICBASE, msr);
+ is_bsp = !!(msr & MSR_IA32_APICBASE_BSP);
+ }
+
if (apic_id == topo_info.boot_cpu_apic_id) {
- topo_info.real_bsp_apic_id = apic_id;
- return false;
+ /*
+ * If the boot CPU has the APIC BSP bit set then the
+ * firmware enumeration is agreeing. If the CPU does not
+ * have the APICBASE MSR then the only choice is to trust
+ * the enumeration order.
+ */
+ if (is_bsp || !has_apic_base) {
+ topo_info.real_bsp_apic_id = apic_id;
+ return false;
+ }
+ /*
+ * If the boot APIC is enumerated first, but the APICBASE
+ * MSR does not have the BSP bit set, then there is no way
+ * to discover the real BSP here. Assume a crash kernel and
+ * limit the number of CPUs to 1 as an INIT to the real BSP
+ * would reset the machine.
+ */
+ pr_warn("Enumerated BSP APIC %x is not marked in APICBASE MSR\n", apic_id);
+ pr_warn("Assuming crash kernel. Limiting to one CPU to prevent machine INIT\n");
+ set_nr_cpu_ids(1);
+ goto fwbug;
}
- pr_warn("Boot CPU APIC ID not the first enumerated APIC ID: %x > %x\n",
+ pr_warn("Boot CPU APIC ID not the first enumerated APIC ID: %x != %x\n",
topo_info.boot_cpu_apic_id, apic_id);
+
+ if (is_bsp) {
+ /*
+ * The boot CPU has the APIC BSP bit set. Use it and complain
+ * about the broken firmware enumeration.
+ */
+ topo_info.real_bsp_apic_id = topo_info.boot_cpu_apic_id;
+ goto fwbug;
+ }
+
pr_warn("Crash kernel detected. Disabling real BSP to prevent machine INIT\n");
topo_info.real_bsp_apic_id = apic_id;
return true;
+
+fwbug:
+ pr_warn(FW_BUG "APIC enumeration order not specification compliant\n");
+ return false;
}
static unsigned int topo_unit_count(u32 lvlid, enum x86_topology_domains at_level,
diff --git a/arch/x86/kernel/cpu/topology_amd.c b/arch/x86/kernel/cpu/topology_amd.c
index a7aa6eff4ae5..7d476fa697ca 100644
--- a/arch/x86/kernel/cpu/topology_amd.c
+++ b/arch/x86/kernel/cpu/topology_amd.c
@@ -58,7 +58,7 @@ static void store_node(struct topo_scan *tscan, u16 nr_nodes, u16 node_id)
tscan->amd_node_id = node_id;
}
-static bool parse_8000_001e(struct topo_scan *tscan, bool has_0xb)
+static bool parse_8000_001e(struct topo_scan *tscan, bool has_topoext)
{
struct {
// eax
@@ -84,9 +84,9 @@ static bool parse_8000_001e(struct topo_scan *tscan, bool has_0xb)
/*
* If leaf 0xb is available, then the domain shifts are set
- * already and nothing to do here.
+ * already and nothing to do here. Only valid for family >= 0x17.
*/
- if (!has_0xb) {
+ if (!has_topoext && tscan->c->x86 >= 0x17) {
/*
* Leaf 0x80000008 set the CORE domain shift already.
* Update the SMT domain, but do not propagate it.
@@ -119,7 +119,7 @@ static bool parse_8000_001e(struct topo_scan *tscan, bool has_0xb)
return true;
}
-static bool parse_fam10h_node_id(struct topo_scan *tscan)
+static void parse_fam10h_node_id(struct topo_scan *tscan)
{
union {
struct {
@@ -131,20 +131,20 @@ static bool parse_fam10h_node_id(struct topo_scan *tscan)
} nid;
if (!boot_cpu_has(X86_FEATURE_NODEID_MSR))
- return false;
+ return;
rdmsrl(MSR_FAM10H_NODE_ID, nid.msr);
store_node(tscan, nid.nodes_per_pkg + 1, nid.node_id);
tscan->c->topo.llc_id = nid.node_id;
- return true;
}
static void legacy_set_llc(struct topo_scan *tscan)
{
unsigned int apicid = tscan->c->topo.initial_apicid;
- /* parse_8000_0008() set everything up except llc_id */
- tscan->c->topo.llc_id = apicid >> tscan->dom_shifts[TOPO_CORE_DOMAIN];
+ /* If none of the parsers set LLC ID then use the die ID for it. */
+ if (tscan->c->topo.llc_id == BAD_APICID)
+ tscan->c->topo.llc_id = apicid >> tscan->dom_shifts[TOPO_CORE_DOMAIN];
}
static void topoext_fixup(struct topo_scan *tscan)
@@ -169,28 +169,28 @@ static void topoext_fixup(struct topo_scan *tscan)
static void parse_topology_amd(struct topo_scan *tscan)
{
- bool has_0xb = false;
+ bool has_topoext = false;
/*
* If the extended topology leaf 0x8000_001e is available
- * try to get SMT and CORE shift from leaf 0xb first, then
- * try to get the CORE shift from leaf 0x8000_0008.
+ * try to get SMT, CORE, TILE, and DIE shifts from extended
+ * CPUID leaf 0x8000_0026 on supported processors first. If
+ * extended CPUID leaf 0x8000_0026 is not supported, try to
+ * get SMT and CORE shift from leaf 0xb first, then try to
+ * get the CORE shift from leaf 0x8000_0008.
*/
if (cpu_feature_enabled(X86_FEATURE_TOPOEXT))
- has_0xb = cpu_parse_topology_ext(tscan);
+ has_topoext = cpu_parse_topology_ext(tscan);
- if (!has_0xb && !parse_8000_0008(tscan))
+ if (!has_topoext && !parse_8000_0008(tscan))
return;
/* Prefer leaf 0x8000001e if available */
- if (parse_8000_001e(tscan, has_0xb))
+ if (parse_8000_001e(tscan, has_topoext))
return;
/* Try the NODEID MSR */
- if (parse_fam10h_node_id(tscan))
- return;
-
- legacy_set_llc(tscan);
+ parse_fam10h_node_id(tscan);
}
void cpu_parse_topology_amd(struct topo_scan *tscan)
@@ -198,6 +198,7 @@ void cpu_parse_topology_amd(struct topo_scan *tscan)
tscan->amd_nodes_per_pkg = 1;
topoext_fixup(tscan);
parse_topology_amd(tscan);
+ legacy_set_llc(tscan);
if (tscan->amd_nodes_per_pkg > 1)
set_cpu_cap(tscan->c, X86_FEATURE_AMD_DCM);
diff --git a/arch/x86/kernel/cpu/topology_ext.c b/arch/x86/kernel/cpu/topology_ext.c
index e477228cd5b2..467b0326bf1a 100644
--- a/arch/x86/kernel/cpu/topology_ext.c
+++ b/arch/x86/kernel/cpu/topology_ext.c
@@ -13,7 +13,10 @@ enum topo_types {
CORE_TYPE = 2,
MAX_TYPE_0B = 3,
MODULE_TYPE = 3,
+ AMD_CCD_TYPE = 3,
TILE_TYPE = 4,
+ AMD_SOCKET_TYPE = 4,
+ MAX_TYPE_80000026 = 5,
DIE_TYPE = 5,
DIEGRP_TYPE = 6,
MAX_TYPE_1F = 7,
@@ -32,6 +35,13 @@ static const unsigned int topo_domain_map_0b_1f[MAX_TYPE_1F] = {
[DIEGRP_TYPE] = TOPO_DIEGRP_DOMAIN,
};
+static const unsigned int topo_domain_map_80000026[MAX_TYPE_80000026] = {
+ [SMT_TYPE] = TOPO_SMT_DOMAIN,
+ [CORE_TYPE] = TOPO_CORE_DOMAIN,
+ [AMD_CCD_TYPE] = TOPO_TILE_DOMAIN,
+ [AMD_SOCKET_TYPE] = TOPO_DIE_DOMAIN,
+};
+
static inline bool topo_subleaf(struct topo_scan *tscan, u32 leaf, u32 subleaf,
unsigned int *last_dom)
{
@@ -56,6 +66,7 @@ static inline bool topo_subleaf(struct topo_scan *tscan, u32 leaf, u32 subleaf,
switch (leaf) {
case 0x0b: maxtype = MAX_TYPE_0B; map = topo_domain_map_0b_1f; break;
case 0x1f: maxtype = MAX_TYPE_1F; map = topo_domain_map_0b_1f; break;
+ case 0x80000026: maxtype = MAX_TYPE_80000026; map = topo_domain_map_80000026; break;
default: return false;
}
@@ -125,6 +136,10 @@ bool cpu_parse_topology_ext(struct topo_scan *tscan)
if (tscan->c->cpuid_level >= 0x1f && parse_topology_leaf(tscan, 0x1f))
return true;
+ /* AMD: Try leaf 0x80000026 first. */
+ if (tscan->c->extended_cpuid_level >= 0x80000026 && parse_topology_leaf(tscan, 0x80000026))
+ return true;
+
/* Intel/AMD: Fall back to leaf 0xB if available */
return tscan->c->cpuid_level >= 0x0b && parse_topology_leaf(tscan, 0x0b);
}
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 11f83d07925e..00189cdeb775 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -41,80 +41,97 @@
#define CPUID_VMWARE_INFO_LEAF 0x40000000
#define CPUID_VMWARE_FEATURES_LEAF 0x40000010
-#define CPUID_VMWARE_FEATURES_ECX_VMMCALL BIT(0)
-#define CPUID_VMWARE_FEATURES_ECX_VMCALL BIT(1)
-#define VMWARE_HYPERVISOR_MAGIC 0x564D5868
-
-#define VMWARE_CMD_GETVERSION 10
-#define VMWARE_CMD_GETHZ 45
-#define VMWARE_CMD_GETVCPU_INFO 68
-#define VMWARE_CMD_LEGACY_X2APIC 3
-#define VMWARE_CMD_VCPU_RESERVED 31
-#define VMWARE_CMD_STEALCLOCK 91
+#define GETVCPU_INFO_LEGACY_X2APIC BIT(3)
+#define GETVCPU_INFO_VCPU_RESERVED BIT(31)
#define STEALCLOCK_NOT_AVAILABLE (-1)
#define STEALCLOCK_DISABLED 0
#define STEALCLOCK_ENABLED 1
-#define VMWARE_PORT(cmd, eax, ebx, ecx, edx) \
- __asm__("inl (%%dx), %%eax" : \
- "=a"(eax), "=c"(ecx), "=d"(edx), "=b"(ebx) : \
- "a"(VMWARE_HYPERVISOR_MAGIC), \
- "c"(VMWARE_CMD_##cmd), \
- "d"(VMWARE_HYPERVISOR_PORT), "b"(UINT_MAX) : \
- "memory")
-
-#define VMWARE_VMCALL(cmd, eax, ebx, ecx, edx) \
- __asm__("vmcall" : \
- "=a"(eax), "=c"(ecx), "=d"(edx), "=b"(ebx) : \
- "a"(VMWARE_HYPERVISOR_MAGIC), \
- "c"(VMWARE_CMD_##cmd), \
- "d"(0), "b"(UINT_MAX) : \
- "memory")
-
-#define VMWARE_VMMCALL(cmd, eax, ebx, ecx, edx) \
- __asm__("vmmcall" : \
- "=a"(eax), "=c"(ecx), "=d"(edx), "=b"(ebx) : \
- "a"(VMWARE_HYPERVISOR_MAGIC), \
- "c"(VMWARE_CMD_##cmd), \
- "d"(0), "b"(UINT_MAX) : \
- "memory")
-
-#define VMWARE_CMD(cmd, eax, ebx, ecx, edx) do { \
- switch (vmware_hypercall_mode) { \
- case CPUID_VMWARE_FEATURES_ECX_VMCALL: \
- VMWARE_VMCALL(cmd, eax, ebx, ecx, edx); \
- break; \
- case CPUID_VMWARE_FEATURES_ECX_VMMCALL: \
- VMWARE_VMMCALL(cmd, eax, ebx, ecx, edx); \
- break; \
- default: \
- VMWARE_PORT(cmd, eax, ebx, ecx, edx); \
- break; \
- } \
- } while (0)
-
struct vmware_steal_time {
union {
- uint64_t clock; /* stolen time counter in units of vtsc */
+ u64 clock; /* stolen time counter in units of vtsc */
struct {
/* only for little-endian */
- uint32_t clock_low;
- uint32_t clock_high;
+ u32 clock_low;
+ u32 clock_high;
};
};
- uint64_t reserved[7];
+ u64 reserved[7];
};
static unsigned long vmware_tsc_khz __ro_after_init;
static u8 vmware_hypercall_mode __ro_after_init;
+unsigned long vmware_hypercall_slow(unsigned long cmd,
+ unsigned long in1, unsigned long in3,
+ unsigned long in4, unsigned long in5,
+ u32 *out1, u32 *out2, u32 *out3,
+ u32 *out4, u32 *out5)
+{
+ unsigned long out0, rbx, rcx, rdx, rsi, rdi;
+
+ switch (vmware_hypercall_mode) {
+ case CPUID_VMWARE_FEATURES_ECX_VMCALL:
+ asm_inline volatile ("vmcall"
+ : "=a" (out0), "=b" (rbx), "=c" (rcx),
+ "=d" (rdx), "=S" (rsi), "=D" (rdi)
+ : "a" (VMWARE_HYPERVISOR_MAGIC),
+ "b" (in1),
+ "c" (cmd),
+ "d" (in3),
+ "S" (in4),
+ "D" (in5)
+ : "cc", "memory");
+ break;
+ case CPUID_VMWARE_FEATURES_ECX_VMMCALL:
+ asm_inline volatile ("vmmcall"
+ : "=a" (out0), "=b" (rbx), "=c" (rcx),
+ "=d" (rdx), "=S" (rsi), "=D" (rdi)
+ : "a" (VMWARE_HYPERVISOR_MAGIC),
+ "b" (in1),
+ "c" (cmd),
+ "d" (in3),
+ "S" (in4),
+ "D" (in5)
+ : "cc", "memory");
+ break;
+ default:
+ asm_inline volatile ("movw %[port], %%dx; inl (%%dx), %%eax"
+ : "=a" (out0), "=b" (rbx), "=c" (rcx),
+ "=d" (rdx), "=S" (rsi), "=D" (rdi)
+ : [port] "i" (VMWARE_HYPERVISOR_PORT),
+ "a" (VMWARE_HYPERVISOR_MAGIC),
+ "b" (in1),
+ "c" (cmd),
+ "d" (in3),
+ "S" (in4),
+ "D" (in5)
+ : "cc", "memory");
+ break;
+ }
+
+ if (out1)
+ *out1 = rbx;
+ if (out2)
+ *out2 = rcx;
+ if (out3)
+ *out3 = rdx;
+ if (out4)
+ *out4 = rsi;
+ if (out5)
+ *out5 = rdi;
+
+ return out0;
+}
+
static inline int __vmware_platform(void)
{
- uint32_t eax, ebx, ecx, edx;
- VMWARE_CMD(GETVERSION, eax, ebx, ecx, edx);
- return eax != (uint32_t)-1 && ebx == VMWARE_HYPERVISOR_MAGIC;
+ u32 eax, ebx, ecx;
+
+ eax = vmware_hypercall3(VMWARE_CMD_GETVERSION, 0, &ebx, &ecx);
+ return eax != UINT_MAX && ebx == VMWARE_HYPERVISOR_MAGIC;
}
static unsigned long vmware_get_tsc_khz(void)
@@ -166,21 +183,12 @@ static void __init vmware_cyc2ns_setup(void)
pr_info("using clock offset of %llu ns\n", d->cyc2ns_offset);
}
-static int vmware_cmd_stealclock(uint32_t arg1, uint32_t arg2)
+static int vmware_cmd_stealclock(u32 addr_hi, u32 addr_lo)
{
- uint32_t result, info;
-
- asm volatile (VMWARE_HYPERCALL :
- "=a"(result),
- "=c"(info) :
- "a"(VMWARE_HYPERVISOR_MAGIC),
- "b"(0),
- "c"(VMWARE_CMD_STEALCLOCK),
- "d"(0),
- "S"(arg1),
- "D"(arg2) :
- "memory");
- return result;
+ u32 info;
+
+ return vmware_hypercall5(VMWARE_CMD_STEALCLOCK, 0, 0, addr_hi, addr_lo,
+ &info);
}
static bool stealclock_enable(phys_addr_t pa)
@@ -215,15 +223,15 @@ static bool vmware_is_stealclock_available(void)
* Return:
* The steal clock reading in ns.
*/
-static uint64_t vmware_steal_clock(int cpu)
+static u64 vmware_steal_clock(int cpu)
{
struct vmware_steal_time *steal = &per_cpu(vmw_steal_time, cpu);
- uint64_t clock;
+ u64 clock;
if (IS_ENABLED(CONFIG_64BIT))
clock = READ_ONCE(steal->clock);
else {
- uint32_t initial_high, low, high;
+ u32 initial_high, low, high;
do {
initial_high = READ_ONCE(steal->clock_high);
@@ -235,7 +243,7 @@ static uint64_t vmware_steal_clock(int cpu)
high = READ_ONCE(steal->clock_high);
} while (initial_high != high);
- clock = ((uint64_t)high << 32) | low;
+ clock = ((u64)high << 32) | low;
}
return mul_u64_u32_shr(clock, vmware_cyc2ns.cyc2ns_mul,
@@ -389,13 +397,13 @@ static void __init vmware_set_capabilities(void)
static void __init vmware_platform_setup(void)
{
- uint32_t eax, ebx, ecx, edx;
- uint64_t lpj, tsc_khz;
+ u32 eax, ebx, ecx;
+ u64 lpj, tsc_khz;
- VMWARE_CMD(GETHZ, eax, ebx, ecx, edx);
+ eax = vmware_hypercall3(VMWARE_CMD_GETHZ, UINT_MAX, &ebx, &ecx);
if (ebx != UINT_MAX) {
- lpj = tsc_khz = eax | (((uint64_t)ebx) << 32);
+ lpj = tsc_khz = eax | (((u64)ebx) << 32);
do_div(tsc_khz, 1000);
WARN_ON(tsc_khz >> 32);
pr_info("TSC freq read from hypervisor : %lu.%03lu MHz\n",
@@ -446,7 +454,7 @@ static u8 __init vmware_select_hypercall(void)
* If !boot_cpu_has(X86_FEATURE_HYPERVISOR), vmware_hypercall_mode
* intentionally defaults to 0.
*/
-static uint32_t __init vmware_platform(void)
+static u32 __init vmware_platform(void)
{
if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
unsigned int eax;
@@ -474,12 +482,65 @@ static uint32_t __init vmware_platform(void)
/* Checks if hypervisor supports x2apic without VT-D interrupt remapping. */
static bool __init vmware_legacy_x2apic_available(void)
{
- uint32_t eax, ebx, ecx, edx;
- VMWARE_CMD(GETVCPU_INFO, eax, ebx, ecx, edx);
- return !(eax & BIT(VMWARE_CMD_VCPU_RESERVED)) &&
- (eax & BIT(VMWARE_CMD_LEGACY_X2APIC));
+ u32 eax;
+
+ eax = vmware_hypercall1(VMWARE_CMD_GETVCPU_INFO, 0);
+ return !(eax & GETVCPU_INFO_VCPU_RESERVED) &&
+ (eax & GETVCPU_INFO_LEGACY_X2APIC);
}
+#ifdef CONFIG_INTEL_TDX_GUEST
+/*
+ * TDCALL[TDG.VP.VMCALL] uses %rax (arg0) and %rcx (arg2). Therefore,
+ * we remap those registers to %r12 and %r13, respectively.
+ */
+unsigned long vmware_tdx_hypercall(unsigned long cmd,
+ unsigned long in1, unsigned long in3,
+ unsigned long in4, unsigned long in5,
+ u32 *out1, u32 *out2, u32 *out3,
+ u32 *out4, u32 *out5)
+{
+ struct tdx_module_args args = {};
+
+ if (!hypervisor_is_type(X86_HYPER_VMWARE)) {
+ pr_warn_once("Incorrect usage\n");
+ return ULONG_MAX;
+ }
+
+ if (cmd & ~VMWARE_CMD_MASK) {
+ pr_warn_once("Out of range command %lx\n", cmd);
+ return ULONG_MAX;
+ }
+
+ args.rbx = in1;
+ args.rdx = in3;
+ args.rsi = in4;
+ args.rdi = in5;
+ args.r10 = VMWARE_TDX_VENDOR_LEAF;
+ args.r11 = VMWARE_TDX_HCALL_FUNC;
+ args.r12 = VMWARE_HYPERVISOR_MAGIC;
+ args.r13 = cmd;
+ /* CPL */
+ args.r15 = 0;
+
+ __tdx_hypercall(&args);
+
+ if (out1)
+ *out1 = args.rbx;
+ if (out2)
+ *out2 = args.r13;
+ if (out3)
+ *out3 = args.rdx;
+ if (out4)
+ *out4 = args.rsi;
+ if (out5)
+ *out5 = args.rdi;
+
+ return args.r12;
+}
+EXPORT_SYMBOL_GPL(vmware_tdx_hypercall);
+#endif
+
#ifdef CONFIG_AMD_MEM_ENCRYPT
static void vmware_sev_es_hcall_prepare(struct ghcb *ghcb,
struct pt_regs *regs)
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index e74d0c4286c1..340af8155658 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -128,6 +128,18 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
#ifdef CONFIG_HPET_TIMER
hpet_disable();
#endif
+
+ /*
+ * Non-crash kexec calls enc_kexec_begin() while scheduling is still
+ * active. This allows the callback to wait until all in-flight
+ * shared<->private conversions are complete. In a crash scenario,
+ * enc_kexec_begin() gets called after all but one CPU have been shut
+ * down and interrupts have been disabled. This allows the callback to
+ * detect a race with the conversion and report it.
+ */
+ x86_platform.guest.enc_kexec_begin();
+ x86_platform.guest.enc_kexec_finish();
+
crash_save_cpu(regs, safe_smp_processor_id());
}
@@ -402,20 +414,26 @@ int crash_load_segments(struct kimage *image)
#undef pr_fmt
#define pr_fmt(fmt) "crash hp: " fmt
-/* These functions provide the value for the sysfs crash_hotplug nodes */
-#ifdef CONFIG_HOTPLUG_CPU
-int arch_crash_hotplug_cpu_support(void)
+int arch_crash_hotplug_support(struct kimage *image, unsigned long kexec_flags)
{
- return crash_check_update_elfcorehdr();
-}
-#endif
-#ifdef CONFIG_MEMORY_HOTPLUG
-int arch_crash_hotplug_memory_support(void)
-{
- return crash_check_update_elfcorehdr();
-}
+#ifdef CONFIG_KEXEC_FILE
+ if (image->file_mode)
+ return 1;
#endif
+ /*
+ * Initially, crash hotplug support for kexec_load was added
+ * with the KEXEC_UPDATE_ELFCOREHDR flag. Later, this
+ * functionality was expanded to accommodate multiple kexec
+ * segment updates, leading to the introduction of the
+ * KEXEC_CRASH_HOTPLUG_SUPPORT kexec flag bit. Consequently,
+ * when the kexec tool sends either of these flags, it indicates
+ * that the required kexec segment (elfcorehdr) is excluded from
+ * the SHA calculation.
+ */
+ return (kexec_flags & KEXEC_UPDATE_ELFCOREHDR ||
+ kexec_flags & KEXEC_CRASH_HOTPLUG_SUPPORT);
+}
unsigned int arch_crash_get_elfcorehdr_size(void)
{
@@ -432,10 +450,12 @@ unsigned int arch_crash_get_elfcorehdr_size(void)
/**
* arch_crash_handle_hotplug_event() - Handle hotplug elfcorehdr changes
* @image: a pointer to kexec_crash_image
+ * @arg: struct memory_notify handler for memory hotplug case and
+ * NULL for CPU hotplug case.
*
* Prepare the new elfcorehdr and replace the existing elfcorehdr.
*/
-void arch_crash_handle_hotplug_event(struct kimage *image)
+void arch_crash_handle_hotplug_event(struct kimage *image, void *arg)
{
void *elfbuf = NULL, *old_elfcorehdr;
unsigned long nr_mem_ranges;
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index 003e0298f46a..64280879c68c 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -24,6 +24,7 @@
#include <asm/pci_x86.h>
#include <asm/setup.h>
#include <asm/i8259.h>
+#include <asm/numa.h>
#include <asm/prom.h>
__initdata u64 initial_dtb;
@@ -82,7 +83,7 @@ static int x86_of_pci_irq_enable(struct pci_dev *dev)
ret = pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
if (ret)
- return ret;
+ return pcibios_err_to_errno(ret);
if (!pin)
return 0;
@@ -137,6 +138,7 @@ static void __init dtb_cpu_setup(void)
continue;
}
topology_register_apic(apic_id, CPU_ACPIID_INVALID, true);
+ set_apicid_to_node(apic_id, of_node_to_nid(dn));
}
}
@@ -277,9 +279,18 @@ static void __init dtb_apic_setup(void)
dtb_ioapic_setup();
}
-#ifdef CONFIG_OF_EARLY_FLATTREE
+static void __init x86_dtb_parse_smp_config(void)
+{
+ if (!of_have_populated_dt())
+ return;
+
+ dtb_setup_hpet();
+ dtb_apic_setup();
+}
+
void __init x86_flattree_get_config(void)
{
+#ifdef CONFIG_OF_EARLY_FLATTREE
u32 size, map_len;
void *dt;
@@ -301,14 +312,7 @@ void __init x86_flattree_get_config(void)
if (initial_dtb)
early_memunmap(dt, map_len);
-}
#endif
-
-void __init x86_dtb_parse_smp_config(void)
-{
- if (!of_have_populated_dt())
- return;
-
- dtb_setup_hpet();
- dtb_apic_setup();
+ if (of_have_populated_dt())
+ x86_init.mpparse.parse_smp_cfg = x86_dtb_parse_smp_config;
}
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 44a91ef5a23b..a7d562697e50 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -405,8 +405,8 @@ static void __die_header(const char *str, struct pt_regs *regs, long err)
pr = IS_ENABLED(CONFIG_PREEMPT_RT) ? " PREEMPT_RT" : " PREEMPT";
printk(KERN_DEFAULT
- "%s: %04lx [#%d]%s%s%s%s%s\n", str, err & 0xffff, ++die_counter,
- pr,
+ "Oops: %s: %04lx [#%d]%s%s%s%s%s\n", str, err & 0xffff,
+ ++die_counter, pr,
IS_ENABLED(CONFIG_SMP) ? " SMP" : "",
debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "",
IS_ENABLED(CONFIG_KASAN) ? " KASAN" : "",
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 68b09f718f10..4893d30ce438 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -828,7 +828,7 @@ u64 __init e820__memblock_alloc_reserved(u64 size, u64 align)
/*
* Find the highest page frame number we have available
*/
-static unsigned long __init e820_end_pfn(unsigned long limit_pfn, enum e820_type type)
+static unsigned long __init e820__end_ram_pfn(unsigned long limit_pfn)
{
int i;
unsigned long last_pfn = 0;
@@ -839,7 +839,8 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, enum e820_type
unsigned long start_pfn;
unsigned long end_pfn;
- if (entry->type != type)
+ if (entry->type != E820_TYPE_RAM &&
+ entry->type != E820_TYPE_ACPI)
continue;
start_pfn = entry->addr >> PAGE_SHIFT;
@@ -865,12 +866,12 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, enum e820_type
unsigned long __init e820__end_of_ram_pfn(void)
{
- return e820_end_pfn(MAX_ARCH_PFN, E820_TYPE_RAM);
+ return e820__end_ram_pfn(MAX_ARCH_PFN);
}
unsigned long __init e820__end_of_low_ram_pfn(void)
{
- return e820_end_pfn(1UL << (32 - PAGE_SHIFT), E820_TYPE_RAM);
+ return e820__end_ram_pfn(1UL << (32 - PAGE_SHIFT));
}
static void __init early_panic(char *msg)
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index 59f4aefc6bc1..29d1f9104e94 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -17,8 +17,8 @@
#include <linux/bcma/bcma.h>
#include <linux/bcma/bcma_regs.h>
#include <linux/platform_data/x86/apple.h>
-#include <drm/i915_drm.h>
-#include <drm/i915_pciids.h>
+#include <drm/intel/i915_drm.h>
+#include <drm/intel/i915_pciids.h>
#include <asm/pci-direct.h>
#include <asm/dma.h>
#include <asm/io_apic.h>
@@ -518,47 +518,46 @@ static const struct intel_early_ops gen11_early_ops __initconst = {
/* Intel integrated GPUs for which we need to reserve "stolen memory" */
static const struct pci_device_id intel_early_ids[] __initconst = {
- INTEL_I830_IDS(&i830_early_ops),
- INTEL_I845G_IDS(&i845_early_ops),
- INTEL_I85X_IDS(&i85x_early_ops),
- INTEL_I865G_IDS(&i865_early_ops),
- INTEL_I915G_IDS(&gen3_early_ops),
- INTEL_I915GM_IDS(&gen3_early_ops),
- INTEL_I945G_IDS(&gen3_early_ops),
- INTEL_I945GM_IDS(&gen3_early_ops),
- INTEL_VLV_IDS(&gen6_early_ops),
- INTEL_PINEVIEW_G_IDS(&gen3_early_ops),
- INTEL_PINEVIEW_M_IDS(&gen3_early_ops),
- INTEL_I965G_IDS(&gen3_early_ops),
- INTEL_G33_IDS(&gen3_early_ops),
- INTEL_I965GM_IDS(&gen3_early_ops),
- INTEL_GM45_IDS(&gen3_early_ops),
- INTEL_G45_IDS(&gen3_early_ops),
- INTEL_IRONLAKE_D_IDS(&gen3_early_ops),
- INTEL_IRONLAKE_M_IDS(&gen3_early_ops),
- INTEL_SNB_D_IDS(&gen6_early_ops),
- INTEL_SNB_M_IDS(&gen6_early_ops),
- INTEL_IVB_M_IDS(&gen6_early_ops),
- INTEL_IVB_D_IDS(&gen6_early_ops),
- INTEL_HSW_IDS(&gen6_early_ops),
- INTEL_BDW_IDS(&gen8_early_ops),
- INTEL_CHV_IDS(&chv_early_ops),
- INTEL_SKL_IDS(&gen9_early_ops),
- INTEL_BXT_IDS(&gen9_early_ops),
- INTEL_KBL_IDS(&gen9_early_ops),
- INTEL_CFL_IDS(&gen9_early_ops),
- INTEL_GLK_IDS(&gen9_early_ops),
- INTEL_CNL_IDS(&gen9_early_ops),
- INTEL_ICL_11_IDS(&gen11_early_ops),
- INTEL_EHL_IDS(&gen11_early_ops),
- INTEL_JSL_IDS(&gen11_early_ops),
- INTEL_TGL_12_IDS(&gen11_early_ops),
- INTEL_RKL_IDS(&gen11_early_ops),
- INTEL_ADLS_IDS(&gen11_early_ops),
- INTEL_ADLP_IDS(&gen11_early_ops),
- INTEL_ADLN_IDS(&gen11_early_ops),
- INTEL_RPLS_IDS(&gen11_early_ops),
- INTEL_RPLP_IDS(&gen11_early_ops),
+ INTEL_I830_IDS(INTEL_VGA_DEVICE, &i830_early_ops),
+ INTEL_I845G_IDS(INTEL_VGA_DEVICE, &i845_early_ops),
+ INTEL_I85X_IDS(INTEL_VGA_DEVICE, &i85x_early_ops),
+ INTEL_I865G_IDS(INTEL_VGA_DEVICE, &i865_early_ops),
+ INTEL_I915G_IDS(INTEL_VGA_DEVICE, &gen3_early_ops),
+ INTEL_I915GM_IDS(INTEL_VGA_DEVICE, &gen3_early_ops),
+ INTEL_I945G_IDS(INTEL_VGA_DEVICE, &gen3_early_ops),
+ INTEL_I945GM_IDS(INTEL_VGA_DEVICE, &gen3_early_ops),
+ INTEL_VLV_IDS(INTEL_VGA_DEVICE, &gen6_early_ops),
+ INTEL_PNV_IDS(INTEL_VGA_DEVICE, &gen3_early_ops),
+ INTEL_I965G_IDS(INTEL_VGA_DEVICE, &gen3_early_ops),
+ INTEL_G33_IDS(INTEL_VGA_DEVICE, &gen3_early_ops),
+ INTEL_I965GM_IDS(INTEL_VGA_DEVICE, &gen3_early_ops),
+ INTEL_GM45_IDS(INTEL_VGA_DEVICE, &gen3_early_ops),
+ INTEL_G45_IDS(INTEL_VGA_DEVICE, &gen3_early_ops),
+ INTEL_ILK_IDS(INTEL_VGA_DEVICE, &gen3_early_ops),
+ INTEL_SNB_IDS(INTEL_VGA_DEVICE, &gen6_early_ops),
+ INTEL_IVB_IDS(INTEL_VGA_DEVICE, &gen6_early_ops),
+ INTEL_HSW_IDS(INTEL_VGA_DEVICE, &gen6_early_ops),
+ INTEL_BDW_IDS(INTEL_VGA_DEVICE, &gen8_early_ops),
+ INTEL_CHV_IDS(INTEL_VGA_DEVICE, &chv_early_ops),
+ INTEL_SKL_IDS(INTEL_VGA_DEVICE, &gen9_early_ops),
+ INTEL_BXT_IDS(INTEL_VGA_DEVICE, &gen9_early_ops),
+ INTEL_KBL_IDS(INTEL_VGA_DEVICE, &gen9_early_ops),
+ INTEL_CFL_IDS(INTEL_VGA_DEVICE, &gen9_early_ops),
+ INTEL_WHL_IDS(INTEL_VGA_DEVICE, &gen9_early_ops),
+ INTEL_CML_IDS(INTEL_VGA_DEVICE, &gen9_early_ops),
+ INTEL_GLK_IDS(INTEL_VGA_DEVICE, &gen9_early_ops),
+ INTEL_CNL_IDS(INTEL_VGA_DEVICE, &gen9_early_ops),
+ INTEL_ICL_IDS(INTEL_VGA_DEVICE, &gen11_early_ops),
+ INTEL_EHL_IDS(INTEL_VGA_DEVICE, &gen11_early_ops),
+ INTEL_JSL_IDS(INTEL_VGA_DEVICE, &gen11_early_ops),
+ INTEL_TGL_IDS(INTEL_VGA_DEVICE, &gen11_early_ops),
+ INTEL_RKL_IDS(INTEL_VGA_DEVICE, &gen11_early_ops),
+ INTEL_ADLS_IDS(INTEL_VGA_DEVICE, &gen11_early_ops),
+ INTEL_ADLP_IDS(INTEL_VGA_DEVICE, &gen11_early_ops),
+ INTEL_ADLN_IDS(INTEL_VGA_DEVICE, &gen11_early_ops),
+ INTEL_RPLS_IDS(INTEL_VGA_DEVICE, &gen11_early_ops),
+ INTEL_RPLU_IDS(INTEL_VGA_DEVICE, &gen11_early_ops),
+ INTEL_RPLP_IDS(INTEL_VGA_DEVICE, &gen11_early_ops),
};
struct resource intel_graphics_stolen_res __ro_after_init = DEFINE_RES_MEM(0, 0);
diff --git a/arch/x86/kernel/eisa.c b/arch/x86/kernel/eisa.c
index 53935b4d62e3..9535a6507db7 100644
--- a/arch/x86/kernel/eisa.c
+++ b/arch/x86/kernel/eisa.c
@@ -11,15 +11,15 @@
static __init int eisa_bus_probe(void)
{
- void __iomem *p;
+ u32 *p;
if ((xen_pv_domain() && !xen_initial_domain()) || cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
return 0;
- p = ioremap(0x0FFFD9, 4);
- if (p && readl(p) == 'E' + ('I' << 8) + ('S' << 16) + ('A' << 24))
+ p = memremap(0x0FFFD9, 4, MEMREMAP_WB);
+ if (p && *p == 'E' + ('I' << 8) + ('S' << 16) + ('A' << 24))
EISA_bus = 1;
- iounmap(p);
+ memunmap(p);
return 0;
}
subsys_initcall(eisa_bus_probe);
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 520deb411a70..1209c7aebb21 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -145,8 +145,8 @@ void restore_fpregs_from_fpstate(struct fpstate *fpstate, u64 mask)
asm volatile(
"fnclex\n\t"
"emms\n\t"
- "fildl %P[addr]" /* set F?P to defined value */
- : : [addr] "m" (fpstate));
+ "fildl %[addr]" /* set F?P to defined value */
+ : : [addr] "m" (*fpstate));
}
if (use_xsave()) {
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index 247f2225aa9f..1065ab995305 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -64,6 +64,16 @@ setfx:
}
/*
+ * Update the value of PKRU register that was already pushed onto the signal frame.
+ */
+static inline int update_pkru_in_sigframe(struct xregs_state __user *buf, u32 pkru)
+{
+ if (unlikely(!cpu_feature_enabled(X86_FEATURE_OSPKE)))
+ return 0;
+ return __put_user(pkru, (unsigned int __user *)get_xsave_addr_user(buf, XFEATURE_PKRU));
+}
+
+/*
* Signal frame handlers.
*/
static inline bool save_fsave_header(struct task_struct *tsk, void __user *buf)
@@ -156,10 +166,17 @@ static inline bool save_xstate_epilog(void __user *buf, int ia32_frame,
return !err;
}
-static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf)
+static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf, u32 pkru)
{
- if (use_xsave())
- return xsave_to_user_sigframe(buf);
+ int err = 0;
+
+ if (use_xsave()) {
+ err = xsave_to_user_sigframe(buf);
+ if (!err)
+ err = update_pkru_in_sigframe(buf, pkru);
+ return err;
+ }
+
if (use_fxsr())
return fxsave_to_user_sigframe((struct fxregs_state __user *) buf);
else
@@ -185,7 +202,7 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf)
* For [f]xsave state, update the SW reserved fields in the [f]xsave frame
* indicating the absence/presence of the extended state to the user.
*/
-bool copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size)
+bool copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size, u32 pkru)
{
struct task_struct *tsk = current;
struct fpstate *fpstate = tsk->thread.fpu.fpstate;
@@ -228,7 +245,7 @@ retry:
fpregs_restore_userregs();
pagefault_disable();
- ret = copy_fpregs_to_sigframe(buf_fx);
+ ret = copy_fpregs_to_sigframe(buf_fx, pkru);
pagefault_enable();
fpregs_unlock();
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 33a214b1a4ce..22abb5ee0cf2 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -13,6 +13,7 @@
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/vmalloc.h>
+#include <linux/coredump.h>
#include <asm/fpu/api.h>
#include <asm/fpu/regset.h>
@@ -23,6 +24,8 @@
#include <asm/prctl.h>
#include <asm/elf.h>
+#include <uapi/asm/elf.h>
+
#include "context.h"
#include "internal.h"
#include "legacy.h"
@@ -788,6 +791,9 @@ void __init fpu__init_system_xstate(unsigned int legacy_size)
goto out_disable;
}
+ fpu_kernel_cfg.independent_features = fpu_kernel_cfg.max_features &
+ XFEATURE_MASK_INDEPENDENT;
+
/*
* Clear XSAVE features that are disabled in the normal CPUID.
*/
@@ -991,6 +997,20 @@ void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
return __raw_xsave_addr(xsave, xfeature_nr);
}
+EXPORT_SYMBOL_GPL(get_xsave_addr);
+
+/*
+ * Given an xstate feature nr, calculate where in the xsave buffer the state is.
+ * The xsave buffer should be in standard format, not compacted (e.g. user mode
+ * signal frames).
+ */
+void __user *get_xsave_addr_user(struct xregs_state __user *xsave, int xfeature_nr)
+{
+ if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr)))
+ return NULL;
+
+ return (void __user *)xsave + xstate_offsets[xfeature_nr];
+}
#ifdef CONFIG_ARCH_HAS_PKEYS
@@ -1434,8 +1454,8 @@ static bool xstate_op_valid(struct fpstate *fpstate, u64 mask, bool rstor)
return rstor;
/*
- * XSAVE(S): clone(), fpu_swap_kvm_fpu()
- * XRSTORS(S): fpu_swap_kvm_fpu()
+ * XSAVE(S): clone(), fpu_swap_kvm_fpstate()
+ * XRSTORS(S): fpu_swap_kvm_fpstate()
*/
/*
@@ -1837,3 +1857,89 @@ int proc_pid_arch_status(struct seq_file *m, struct pid_namespace *ns,
return 0;
}
#endif /* CONFIG_PROC_PID_ARCH_STATUS */
+
+#ifdef CONFIG_COREDUMP
+static const char owner_name[] = "LINUX";
+
+/*
+ * Dump type, size, offset and flag values for every xfeature that is present.
+ */
+static int dump_xsave_layout_desc(struct coredump_params *cprm)
+{
+ int num_records = 0;
+ int i;
+
+ for_each_extended_xfeature(i, fpu_user_cfg.max_features) {
+ struct x86_xfeat_component xc = {
+ .type = i,
+ .size = xstate_sizes[i],
+ .offset = xstate_offsets[i],
+ /* reserved for future use */
+ .flags = 0,
+ };
+
+ if (!dump_emit(cprm, &xc, sizeof(xc)))
+ return 0;
+
+ num_records++;
+ }
+ return num_records;
+}
+
+static u32 get_xsave_desc_size(void)
+{
+ u32 cnt = 0;
+ u32 i;
+
+ for_each_extended_xfeature(i, fpu_user_cfg.max_features)
+ cnt++;
+
+ return cnt * (sizeof(struct x86_xfeat_component));
+}
+
+int elf_coredump_extra_notes_write(struct coredump_params *cprm)
+{
+ int num_records = 0;
+ struct elf_note en;
+
+ if (!fpu_user_cfg.max_features)
+ return 0;
+
+ en.n_namesz = sizeof(owner_name);
+ en.n_descsz = get_xsave_desc_size();
+ en.n_type = NT_X86_XSAVE_LAYOUT;
+
+ if (!dump_emit(cprm, &en, sizeof(en)))
+ return 1;
+ if (!dump_emit(cprm, owner_name, en.n_namesz))
+ return 1;
+ if (!dump_align(cprm, 4))
+ return 1;
+
+ num_records = dump_xsave_layout_desc(cprm);
+ if (!num_records)
+ return 1;
+
+ /* Total size should be equal to the number of records */
+ if ((sizeof(struct x86_xfeat_component) * num_records) != en.n_descsz)
+ return 1;
+
+ return 0;
+}
+
+int elf_coredump_extra_notes_size(void)
+{
+ int size;
+
+ if (!fpu_user_cfg.max_features)
+ return 0;
+
+ /* .note header */
+ size = sizeof(struct elf_note);
+ /* Name plus alignment to 4 bytes */
+ size += roundup(sizeof(owner_name), 4);
+ size += get_xsave_desc_size();
+
+ return size;
+}
+#endif /* CONFIG_COREDUMP */
diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h
index 19ca623ffa2a..0b86a5002c84 100644
--- a/arch/x86/kernel/fpu/xstate.h
+++ b/arch/x86/kernel/fpu/xstate.h
@@ -54,7 +54,7 @@ extern int copy_sigframe_from_user_to_xstate(struct task_struct *tsk, const void
extern void fpu__init_cpu_xstate(void);
extern void fpu__init_system_xstate(unsigned int legacy_size);
-extern void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr);
+extern void __user *get_xsave_addr_user(struct xregs_state __user *xsave, int xfeature_nr);
static inline u64 xfeatures_mask_supervisor(void)
{
@@ -64,9 +64,9 @@ static inline u64 xfeatures_mask_supervisor(void)
static inline u64 xfeatures_mask_independent(void)
{
if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR))
- return XFEATURE_MASK_INDEPENDENT & ~XFEATURE_MASK_LBR;
+ return fpu_kernel_cfg.independent_features & ~XFEATURE_MASK_LBR;
- return XFEATURE_MASK_INDEPENDENT;
+ return fpu_kernel_cfg.independent_features;
}
/* XSAVE/XRSTOR wrapper functions */
@@ -108,21 +108,17 @@ static inline u64 xfeatures_mask_independent(void)
* Otherwise, if XSAVEOPT is enabled, XSAVEOPT replaces XSAVE because XSAVEOPT
* supports modified optimization which is not supported by XSAVE.
*
- * We use XSAVE as a fallback.
- *
- * The 661 label is defined in the ALTERNATIVE* macros as the address of the
- * original instruction which gets replaced. We need to use it here as the
- * address of the instruction where we might get an exception at.
+ * Use XSAVE as a fallback.
*/
#define XSTATE_XSAVE(st, lmask, hmask, err) \
- asm volatile(ALTERNATIVE_3(XSAVE, \
+ asm volatile("1: " ALTERNATIVE_3(XSAVE, \
XSAVEOPT, X86_FEATURE_XSAVEOPT, \
XSAVEC, X86_FEATURE_XSAVEC, \
XSAVES, X86_FEATURE_XSAVES) \
"\n" \
"xor %[err], %[err]\n" \
"3:\n" \
- _ASM_EXTABLE_TYPE_REG(661b, 3b, EX_TYPE_EFAULT_REG, %[err]) \
+ _ASM_EXTABLE_TYPE_REG(1b, 3b, EX_TYPE_EFAULT_REG, %[err]) \
: [err] "=r" (err) \
: "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \
: "memory")
@@ -132,11 +128,11 @@ static inline u64 xfeatures_mask_independent(void)
* XSAVE area format.
*/
#define XSTATE_XRESTORE(st, lmask, hmask) \
- asm volatile(ALTERNATIVE(XRSTOR, \
+ asm volatile("1: " ALTERNATIVE(XRSTOR, \
XRSTORS, X86_FEATURE_XSAVES) \
"\n" \
"3:\n" \
- _ASM_EXTABLE_TYPE(661b, 3b, EX_TYPE_FPU_RESTORE) \
+ _ASM_EXTABLE_TYPE(1b, 3b, EX_TYPE_FPU_RESTORE) \
: \
: "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \
: "memory")
diff --git a/arch/x86/kernel/fred.c b/arch/x86/kernel/fred.c
index 4bcd8791ad96..8d32c3f48abc 100644
--- a/arch/x86/kernel/fred.c
+++ b/arch/x86/kernel/fred.c
@@ -21,17 +21,53 @@
#define FRED_STKLVL(vector, lvl) ((lvl) << (2 * (vector)))
+DEFINE_PER_CPU(unsigned long, fred_rsp0);
+EXPORT_PER_CPU_SYMBOL(fred_rsp0);
+
void cpu_init_fred_exceptions(void)
{
/* When FRED is enabled by default, remove this log message */
pr_info("Initialize FRED on CPU%d\n", smp_processor_id());
+ /*
+ * If a kernel event is delivered before a CPU goes to user level for
+ * the first time, its SS is NULL thus NULL is pushed into the SS field
+ * of the FRED stack frame. But before ERETS is executed, the CPU may
+ * context switch to another task and go to user level. Then when the
+ * CPU comes back to kernel mode, SS is changed to __KERNEL_DS. Later
+ * when ERETS is executed to return from the kernel event handler, a #GP
+ * fault is generated because SS doesn't match the SS saved in the FRED
+ * stack frame.
+ *
+ * Initialize SS to __KERNEL_DS when enabling FRED to avoid such #GPs.
+ */
+ loadsegment(ss, __KERNEL_DS);
+
wrmsrl(MSR_IA32_FRED_CONFIG,
/* Reserve for CALL emulation */
FRED_CONFIG_REDZONE |
FRED_CONFIG_INT_STKLVL(0) |
FRED_CONFIG_ENTRYPOINT(asm_fred_entrypoint_user));
+ wrmsrl(MSR_IA32_FRED_STKLVLS, 0);
+ wrmsrl(MSR_IA32_FRED_RSP0, 0);
+ wrmsrl(MSR_IA32_FRED_RSP1, 0);
+ wrmsrl(MSR_IA32_FRED_RSP2, 0);
+ wrmsrl(MSR_IA32_FRED_RSP3, 0);
+
+ /* Enable FRED */
+ cr4_set_bits(X86_CR4_FRED);
+ /* Any further IDT use is a bug */
+ idt_invalidate();
+
+ /* Use int $0x80 for 32-bit system calls in FRED mode */
+ setup_clear_cpu_cap(X86_FEATURE_SYSENTER32);
+ setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
+}
+
+/* Must be called after setup_cpu_entry_areas() */
+void cpu_init_fred_rsps(void)
+{
/*
* The purpose of separate stacks for NMI, #DB and #MC *in the kernel*
* (remember that user space faults are always taken on stack level 0)
@@ -47,13 +83,4 @@ void cpu_init_fred_exceptions(void)
wrmsrl(MSR_IA32_FRED_RSP1, __this_cpu_ist_top_va(DB));
wrmsrl(MSR_IA32_FRED_RSP2, __this_cpu_ist_top_va(NMI));
wrmsrl(MSR_IA32_FRED_RSP3, __this_cpu_ist_top_va(DF));
-
- /* Enable FRED */
- cr4_set_bits(X86_CR4_FRED);
- /* Any further IDT use is a bug */
- idt_invalidate();
-
- /* Use int $0x80 for 32-bit system calls in FRED mode */
- setup_clear_cpu_cap(X86_FEATURE_SYSENTER32);
- setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
}
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 70139d9d2e01..8da0e66ca22d 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -25,6 +25,7 @@
#include <linux/memory.h>
#include <linux/vmalloc.h>
#include <linux/set_memory.h>
+#include <linux/execmem.h>
#include <trace/syscall.h>
@@ -260,25 +261,14 @@ void arch_ftrace_update_code(int command)
/* Currently only x86_64 supports dynamic trampolines */
#ifdef CONFIG_X86_64
-#ifdef CONFIG_MODULES
-#include <linux/moduleloader.h>
-/* Module allocation simplifies allocating memory for code */
static inline void *alloc_tramp(unsigned long size)
{
- return module_alloc(size);
+ return execmem_alloc(EXECMEM_FTRACE, size);
}
static inline void tramp_free(void *tramp)
{
- module_memfree(tramp);
+ execmem_free(tramp);
}
-#else
-/* Trampolines can only be created if modules are supported */
-static inline void *alloc_tramp(unsigned long size)
-{
- return NULL;
-}
-static inline void tramp_free(void *tramp) { }
-#endif
/* Defined as markers to the end of the ftrace default trampolines */
extern void ftrace_regs_caller_end(void);
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index a817ed0724d1..4b9d4557fc94 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -559,10 +559,11 @@ void early_setup_idt(void)
*/
void __head startup_64_setup_gdt_idt(void)
{
+ struct desc_struct *gdt = (void *)(__force unsigned long)init_per_cpu_var(gdt_page.gdt);
void *handler = NULL;
struct desc_ptr startup_gdt_descr = {
- .address = (unsigned long)&RIP_REL_REF(init_per_cpu_var(gdt_page.gdt)),
+ .address = (unsigned long)&RIP_REL_REF(*gdt),
.size = GDT_SIZE - 1,
};
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index b50f3641c4d6..2e42056d2306 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -44,9 +44,6 @@
#define X86_CAPABILITY new_cpu_data+CPUINFO_x86_capability
#define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id
-
-#define SIZEOF_PTREGS 17*4
-
/*
* Worst-case size of the kernel mapping we need to make:
* a relocatable kernel can live anywhere in lowmem, so we need to be able
@@ -488,19 +485,13 @@ SYM_DATA_END(initial_page_table)
.data
.balign 4
-/*
- * The SIZEOF_PTREGS gap is a convention which helps the in-kernel unwinder
- * reliably detect the end of the stack.
- */
-SYM_DATA(initial_stack,
- .long init_thread_union + THREAD_SIZE -
- SIZEOF_PTREGS - TOP_OF_KERNEL_STACK_PADDING)
+SYM_DATA(initial_stack, .long __top_init_kernel_stack)
__INITRODATA
int_msg:
.asciz "Unknown interrupt or fault at: %p %p %p\n"
-#include "../../x86/xen/xen-head.S"
+#include "../xen/xen-head.S"
/*
* The IDT and GDT 'descriptors' are a strange 48-bit object
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index d8198fbd70e5..16752b8dfa89 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -32,13 +32,6 @@
* We are not able to switch in one step to the final KERNEL ADDRESS SPACE
* because we need identity-mapped pages.
*/
-#define l4_index(x) (((x) >> 39) & 511)
-#define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
-
-L4_PAGE_OFFSET = l4_index(__PAGE_OFFSET_BASE_L4)
-L4_START_KERNEL = l4_index(__START_KERNEL_map)
-
-L3_START_KERNEL = pud_index(__START_KERNEL_map)
__HEAD
.code64
@@ -66,7 +59,7 @@ SYM_CODE_START_NOALIGN(startup_64)
mov %rsi, %r15
/* Set up the stack for verify_cpu() */
- leaq (__end_init_task - TOP_OF_KERNEL_STACK_PADDING - PTREGS_SIZE)(%rip), %rsp
+ leaq __top_init_kernel_stack(%rip), %rsp
/* Setup GSBASE to allow stack canary access for C code */
movl $MSR_GS_BASE, %ecx
@@ -577,9 +570,6 @@ SYM_CODE_START_NOALIGN(vc_no_ghcb)
SYM_CODE_END(vc_no_ghcb)
#endif
-#define SYM_DATA_START_PAGE_ALIGNED(name) \
- SYM_START(name, SYM_L_GLOBAL, .balign PAGE_SIZE)
-
#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
/*
* Each PGD needs to be 8k long and 8k aligned. We do not
@@ -601,14 +591,6 @@ SYM_CODE_END(vc_no_ghcb)
#define PTI_USER_PGD_FILL 0
#endif
-/* Automate the creation of 1 to 1 mapping pmd entries */
-#define PMDS(START, PERM, COUNT) \
- i = 0 ; \
- .rept (COUNT) ; \
- .quad (START) + (i << PMD_SHIFT) + (PERM) ; \
- i = i + 1 ; \
- .endr
-
__INITDATA
.balign 4
@@ -708,8 +690,6 @@ SYM_DATA_START_PAGE_ALIGNED(level1_fixmap_pgt)
.endr
SYM_DATA_END(level1_fixmap_pgt)
-#undef PMDS
-
.data
.align 16
@@ -720,7 +700,7 @@ SYM_DATA(smpboot_control, .long 0)
SYM_DATA(phys_base, .quad 0x0)
EXPORT_SYMBOL(phys_base)
-#include "../../x86/xen/xen-head.S"
+#include "../xen/xen-head.S"
__PAGE_ALIGNED_BSS
SYM_DATA_START_PAGE_ALIGNED(empty_zero_page)
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index 2b7999a1a50a..80e262bb627f 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -8,6 +8,7 @@
#include <linux/timex.h>
#include <linux/i8253.h>
+#include <asm/hypervisor.h>
#include <asm/apic.h>
#include <asm/hpet.h>
#include <asm/time.h>
@@ -39,9 +40,15 @@ static bool __init use_pit(void)
bool __init pit_timer_init(void)
{
- if (!use_pit())
+ if (!use_pit()) {
+ /*
+ * Don't just ignore the PIT. Ensure it's stopped, because
+ * VMMs otherwise steal CPU time just to pointlessly waggle
+ * the (masked) IRQ.
+ */
+ clockevent_i8253_disable();
return false;
-
+ }
clockevent_i8253_init(true);
global_clock_event = &i8253_clockevent;
return true;
diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
index fc37c8d83daf..f445bec516a0 100644
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -163,6 +163,9 @@ static const __initconst struct idt_data apic_idts[] = {
# endif
INTG(SPURIOUS_APIC_VECTOR, asm_sysvec_spurious_apic_interrupt),
INTG(ERROR_APIC_VECTOR, asm_sysvec_error_interrupt),
+# ifdef CONFIG_X86_POSTED_MSI
+ INTG(POSTED_MSI_NOTIFICATION_VECTOR, asm_sysvec_posted_msi_notification),
+# endif
#endif
};
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 35fde0107901..385e3a5fc304 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -22,6 +22,8 @@
#include <asm/desc.h>
#include <asm/traps.h>
#include <asm/thermal.h>
+#include <asm/posted_intr.h>
+#include <asm/irq_remapping.h>
#define CREATE_TRACE_POINTS
#include <asm/trace/irq_vectors.h>
@@ -182,6 +184,13 @@ int arch_show_interrupts(struct seq_file *p, int prec)
irq_stats(j)->kvm_posted_intr_wakeup_ipis);
seq_puts(p, " Posted-interrupt wakeup event\n");
#endif
+#ifdef CONFIG_X86_POSTED_MSI
+ seq_printf(p, "%*s: ", prec, "PMN");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ",
+ irq_stats(j)->posted_msi_notification_count);
+ seq_puts(p, " Posted MSI notification event\n");
+#endif
return 0;
}
@@ -240,24 +249,16 @@ static __always_inline void handle_irq(struct irq_desc *desc,
__handle_irq(desc, regs);
}
-/*
- * common_interrupt() handles all normal device IRQ's (the special SMP
- * cross-CPU interrupts have their own entry points).
- */
-DEFINE_IDTENTRY_IRQ(common_interrupt)
+static __always_inline int call_irq_handler(int vector, struct pt_regs *regs)
{
- struct pt_regs *old_regs = set_irq_regs(regs);
struct irq_desc *desc;
-
- /* entry code tells RCU that we're not quiescent. Check it. */
- RCU_LOCKDEP_WARN(!rcu_is_watching(), "IRQ failed to wake up RCU");
+ int ret = 0;
desc = __this_cpu_read(vector_irq[vector]);
if (likely(!IS_ERR_OR_NULL(desc))) {
handle_irq(desc, regs);
} else {
- apic_eoi();
-
+ ret = -EINVAL;
if (desc == VECTOR_UNUSED) {
pr_emerg_ratelimited("%s: %d.%u No irq handler for vector\n",
__func__, smp_processor_id(),
@@ -267,6 +268,23 @@ DEFINE_IDTENTRY_IRQ(common_interrupt)
}
}
+ return ret;
+}
+
+/*
+ * common_interrupt() handles all normal device IRQ's (the special SMP
+ * cross-CPU interrupts have their own entry points).
+ */
+DEFINE_IDTENTRY_IRQ(common_interrupt)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+
+ /* entry code tells RCU that we're not quiescent. Check it. */
+ RCU_LOCKDEP_WARN(!rcu_is_watching(), "IRQ failed to wake up RCU");
+
+ if (unlikely(call_irq_handler(vector, regs)))
+ apic_eoi();
+
set_irq_regs(old_regs);
}
@@ -334,12 +352,139 @@ DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_kvm_posted_intr_nested_ipi)
}
#endif
+#ifdef CONFIG_X86_POSTED_MSI
+
+/* Posted Interrupt Descriptors for coalesced MSIs to be posted */
+DEFINE_PER_CPU_ALIGNED(struct pi_desc, posted_msi_pi_desc);
+
+void intel_posted_msi_init(void)
+{
+ u32 destination;
+ u32 apic_id;
+
+ this_cpu_write(posted_msi_pi_desc.nv, POSTED_MSI_NOTIFICATION_VECTOR);
+
+ /*
+ * APIC destination ID is stored in bit 8:15 while in XAPIC mode.
+ * VT-d spec. CH 9.11
+ */
+ apic_id = this_cpu_read(x86_cpu_to_apicid);
+ destination = x2apic_enabled() ? apic_id : apic_id << 8;
+ this_cpu_write(posted_msi_pi_desc.ndst, destination);
+}
+
+/*
+ * De-multiplexing posted interrupts is on the performance path, the code
+ * below is written to optimize the cache performance based on the following
+ * considerations:
+ * 1.Posted interrupt descriptor (PID) fits in a cache line that is frequently
+ * accessed by both CPU and IOMMU.
+ * 2.During posted MSI processing, the CPU needs to do 64-bit read and xchg
+ * for checking and clearing posted interrupt request (PIR), a 256 bit field
+ * within the PID.
+ * 3.On the other side, the IOMMU does atomic swaps of the entire PID cache
+ * line when posting interrupts and setting control bits.
+ * 4.The CPU can access the cache line a magnitude faster than the IOMMU.
+ * 5.Each time the IOMMU does interrupt posting to the PIR will evict the PID
+ * cache line. The cache line states after each operation are as follows:
+ * CPU IOMMU PID Cache line state
+ * ---------------------------------------------------------------
+ *...read64 exclusive
+ *...lock xchg64 modified
+ *... post/atomic swap invalid
+ *...-------------------------------------------------------------
+ *
+ * To reduce L1 data cache miss, it is important to avoid contention with
+ * IOMMU's interrupt posting/atomic swap. Therefore, a copy of PIR is used
+ * to dispatch interrupt handlers.
+ *
+ * In addition, the code is trying to keep the cache line state consistent
+ * as much as possible. e.g. when making a copy and clearing the PIR
+ * (assuming non-zero PIR bits are present in the entire PIR), it does:
+ * read, read, read, read, xchg, xchg, xchg, xchg
+ * instead of:
+ * read, xchg, read, xchg, read, xchg, read, xchg
+ */
+static __always_inline bool handle_pending_pir(u64 *pir, struct pt_regs *regs)
+{
+ int i, vec = FIRST_EXTERNAL_VECTOR;
+ unsigned long pir_copy[4];
+ bool handled = false;
+
+ for (i = 0; i < 4; i++)
+ pir_copy[i] = pir[i];
+
+ for (i = 0; i < 4; i++) {
+ if (!pir_copy[i])
+ continue;
+
+ pir_copy[i] = arch_xchg(&pir[i], 0);
+ handled = true;
+ }
+
+ if (handled) {
+ for_each_set_bit_from(vec, pir_copy, FIRST_SYSTEM_VECTOR)
+ call_irq_handler(vec, regs);
+ }
+
+ return handled;
+}
+
+/*
+ * Performance data shows that 3 is good enough to harvest 90+% of the benefit
+ * on high IRQ rate workload.
+ */
+#define MAX_POSTED_MSI_COALESCING_LOOP 3
+
+/*
+ * For MSIs that are delivered as posted interrupts, the CPU notifications
+ * can be coalesced if the MSIs arrive in high frequency bursts.
+ */
+DEFINE_IDTENTRY_SYSVEC(sysvec_posted_msi_notification)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+ struct pi_desc *pid;
+ int i = 0;
+
+ pid = this_cpu_ptr(&posted_msi_pi_desc);
+
+ inc_irq_stat(posted_msi_notification_count);
+ irq_enter();
+
+ /*
+ * Max coalescing count includes the extra round of handle_pending_pir
+ * after clearing the outstanding notification bit. Hence, at most
+ * MAX_POSTED_MSI_COALESCING_LOOP - 1 loops are executed here.
+ */
+ while (++i < MAX_POSTED_MSI_COALESCING_LOOP) {
+ if (!handle_pending_pir(pid->pir64, regs))
+ break;
+ }
+
+ /*
+ * Clear outstanding notification bit to allow new IRQ notifications,
+ * do this last to maximize the window of interrupt coalescing.
+ */
+ pi_clear_on(pid);
+
+ /*
+ * There could be a race of PI notification and the clearing of ON bit,
+ * process PIR bits one last time such that handling the new interrupts
+ * are not delayed until the next IRQ.
+ */
+ handle_pending_pir(pid->pir64, regs);
+
+ apic_eoi();
+ irq_exit();
+ set_irq_regs(old_regs);
+}
+#endif /* X86_POSTED_MSI */
#ifdef CONFIG_HOTPLUG_CPU
/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
void fixup_irqs(void)
{
- unsigned int irr, vector;
+ unsigned int vector;
struct irq_desc *desc;
struct irq_data *data;
struct irq_chip *chip;
@@ -366,8 +511,7 @@ void fixup_irqs(void)
if (IS_ERR_OR_NULL(__this_cpu_read(vector_irq[vector])))
continue;
- irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
- if (irr & (1 << (vector % 32))) {
+ if (is_vector_pending(vector)) {
desc = __this_cpu_read(vector_irq[vector]);
raw_spin_lock(&desc->lock);
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index fe0c859873d1..ade0043ce56e 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -18,6 +18,7 @@
#include <linux/uaccess.h>
#include <linux/smp.h>
#include <linux/sched/task_stack.h>
+#include <linux/vmalloc.h>
#include <asm/cpu_entry_area.h>
#include <asm/softirq_stack.h>
diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c
index 9a7c03d47861..51b805c727fc 100644
--- a/arch/x86/kernel/itmt.c
+++ b/arch/x86/kernel/itmt.c
@@ -38,7 +38,7 @@ static bool __read_mostly sched_itmt_capable;
*/
unsigned int __read_mostly sysctl_sched_itmt_enabled;
-static int sched_itmt_update_handler(struct ctl_table *table, int write,
+static int sched_itmt_update_handler(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
unsigned int old_sysctl;
diff --git a/arch/x86/kernel/jailhouse.c b/arch/x86/kernel/jailhouse.c
index df337860612d..cd8ed1edbf9e 100644
--- a/arch/x86/kernel/jailhouse.c
+++ b/arch/x86/kernel/jailhouse.c
@@ -12,6 +12,7 @@
#include <linux/kernel.h>
#include <linux/reboot.h>
#include <linux/serial_8250.h>
+#include <linux/acpi.h>
#include <asm/apic.h>
#include <asm/io_apic.h>
#include <asm/acpi.h>
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index d0e49bd7c6f3..72e6a45e7ec2 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -40,12 +40,12 @@
#include <linux/kgdb.h>
#include <linux/ftrace.h>
#include <linux/kasan.h>
-#include <linux/moduleloader.h>
#include <linux/objtool.h>
#include <linux/vmalloc.h>
#include <linux/pgtable.h>
#include <linux/set_memory.h>
#include <linux/cfi.h>
+#include <linux/execmem.h>
#include <asm/text-patching.h>
#include <asm/cacheflush.h>
@@ -495,7 +495,7 @@ void *alloc_insn_page(void)
{
void *page;
- page = module_alloc(PAGE_SIZE);
+ page = execmem_alloc(EXECMEM_KPROBES, PAGE_SIZE);
if (!page)
return NULL;
diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c
index dd2ec14adb77..15af7e98e161 100644
--- a/arch/x86/kernel/kprobes/ftrace.c
+++ b/arch/x86/kernel/kprobes/ftrace.c
@@ -21,6 +21,9 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
struct kprobe_ctlblk *kcb;
int bit;
+ if (unlikely(kprobe_ftrace_disabled))
+ return;
+
bit = ftrace_test_recursion_trylock(ip, parent_ip);
if (bit < 0)
return;
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 7f0732bc0ccd..263f8aed4e2c 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -44,7 +44,7 @@
#include <asm/svm.h>
#include <asm/e820/api.h>
-DEFINE_STATIC_KEY_FALSE(kvm_async_pf_enabled);
+DEFINE_STATIC_KEY_FALSE_RO(kvm_async_pf_enabled);
static int kvmapf = 1;
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index b180d8e497c3..9c9ac606893e 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -28,6 +28,7 @@
#include <asm/setup.h>
#include <asm/set_memory.h>
#include <asm/cpu.h>
+#include <asm/efi.h>
#ifdef CONFIG_ACPI
/*
@@ -87,6 +88,8 @@ map_efi_systab(struct x86_mapping_info *info, pgd_t *level4p)
{
#ifdef CONFIG_EFI
unsigned long mstart, mend;
+ void *kaddr;
+ int ret;
if (!efi_enabled(EFI_BOOT))
return 0;
@@ -102,6 +105,30 @@ map_efi_systab(struct x86_mapping_info *info, pgd_t *level4p)
if (!mstart)
return 0;
+ ret = kernel_ident_mapping_init(info, level4p, mstart, mend);
+ if (ret)
+ return ret;
+
+ kaddr = memremap(mstart, mend - mstart, MEMREMAP_WB);
+ if (!kaddr) {
+ pr_err("Could not map UEFI system table\n");
+ return -ENOMEM;
+ }
+
+ mstart = efi_config_table;
+
+ if (efi_enabled(EFI_64BIT)) {
+ efi_system_table_64_t *stbl = (efi_system_table_64_t *)kaddr;
+
+ mend = mstart + sizeof(efi_config_table_64_t) * stbl->nr_tables;
+ } else {
+ efi_system_table_32_t *stbl = (efi_system_table_32_t *)kaddr;
+
+ mend = mstart + sizeof(efi_config_table_32_t) * stbl->nr_tables;
+ }
+
+ memunmap(kaddr);
+
return kernel_ident_mapping_init(info, level4p, mstart, mend);
#endif
return 0;
@@ -295,8 +322,15 @@ void machine_kexec_cleanup(struct kimage *image)
void machine_kexec(struct kimage *image)
{
unsigned long page_list[PAGES_NR];
- void *control_page;
+ unsigned int host_mem_enc_active;
int save_ftrace_enabled;
+ void *control_page;
+
+ /*
+ * This must be done before load_segments() since if call depth tracking
+ * is used then GS must be valid to make any function calls.
+ */
+ host_mem_enc_active = cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT);
#ifdef CONFIG_KEXEC_JUMP
if (image->preserve_context)
@@ -358,7 +392,7 @@ void machine_kexec(struct kimage *image)
(unsigned long)page_list,
image->start,
image->preserve_context,
- cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT));
+ host_mem_enc_active);
#ifdef CONFIG_KEXEC_JUMP
if (image->preserve_context)
diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c
index c94dec6a1834..1f54eedc3015 100644
--- a/arch/x86/kernel/mmconf-fam10h_64.c
+++ b/arch/x86/kernel/mmconf-fam10h_64.c
@@ -9,6 +9,7 @@
#include <linux/pci.h>
#include <linux/dmi.h>
#include <linux/range.h>
+#include <linux/acpi.h>
#include <asm/pci-direct.h>
#include <linux/sort.h>
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index e18914c0e38a..837450b6e882 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -36,57 +36,6 @@ do { \
} while (0)
#endif
-#ifdef CONFIG_RANDOMIZE_BASE
-static unsigned long module_load_offset;
-
-/* Mutex protects the module_load_offset. */
-static DEFINE_MUTEX(module_kaslr_mutex);
-
-static unsigned long int get_module_load_offset(void)
-{
- if (kaslr_enabled()) {
- mutex_lock(&module_kaslr_mutex);
- /*
- * Calculate the module_load_offset the first time this
- * code is called. Once calculated it stays the same until
- * reboot.
- */
- if (module_load_offset == 0)
- module_load_offset =
- get_random_u32_inclusive(1, 1024) * PAGE_SIZE;
- mutex_unlock(&module_kaslr_mutex);
- }
- return module_load_offset;
-}
-#else
-static unsigned long int get_module_load_offset(void)
-{
- return 0;
-}
-#endif
-
-void *module_alloc(unsigned long size)
-{
- gfp_t gfp_mask = GFP_KERNEL;
- void *p;
-
- if (PAGE_ALIGN(size) > MODULES_LEN)
- return NULL;
-
- p = __vmalloc_node_range(size, MODULE_ALIGN,
- MODULES_VADDR + get_module_load_offset(),
- MODULES_END, gfp_mask, PAGE_KERNEL,
- VM_FLUSH_RESET_PERMS | VM_DEFER_KMEMLEAK,
- NUMA_NO_NODE, __builtin_return_address(0));
-
- if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) {
- vfree(p);
- return NULL;
- }
-
- return p;
-}
-
#ifdef CONFIG_X86_32
int apply_relocate(Elf32_Shdr *sechdrs,
const char *strtab,
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index e89171b0347a..4a1b1b28abf9 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -68,7 +68,7 @@ static void __init mpc_oem_bus_info(struct mpc_bus *m, char *str)
{
memcpy(str, m->bustype, 6);
str[6] = 0;
- apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->busid, str);
+ apic_pr_verbose("Bus #%d is %s\n", m->busid, str);
}
static void __init MP_bus_info(struct mpc_bus *m)
@@ -417,7 +417,7 @@ static unsigned long __init get_mpc_size(unsigned long physptr)
mpc = early_memremap(physptr, PAGE_SIZE);
size = mpc->length;
early_memunmap(mpc, PAGE_SIZE);
- apic_printk(APIC_VERBOSE, " mpc: %lx-%lx\n", physptr, physptr + size);
+ apic_pr_verbose(" mpc: %lx-%lx\n", physptr, physptr + size);
return size;
}
@@ -560,8 +560,7 @@ static int __init smp_scan_config(unsigned long base, unsigned long length)
struct mpf_intel *mpf;
int ret = 0;
- apic_printk(APIC_VERBOSE, "Scan for SMP in [mem %#010lx-%#010lx]\n",
- base, base + length - 1);
+ apic_pr_verbose("Scan for SMP in [mem %#010lx-%#010lx]\n", base, base + length - 1);
BUILD_BUG_ON(sizeof(*mpf) != 16);
while (length > 0) {
@@ -683,13 +682,13 @@ static void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare)
{
int i;
- apic_printk(APIC_VERBOSE, "OLD ");
+ apic_pr_verbose("OLD ");
print_mp_irq_info(m);
i = get_MP_intsrc_index(m);
if (i > 0) {
memcpy(m, &mp_irqs[i], sizeof(*m));
- apic_printk(APIC_VERBOSE, "NEW ");
+ apic_pr_verbose("NEW ");
print_mp_irq_info(&mp_irqs[i]);
return;
}
@@ -772,7 +771,7 @@ static int __init replace_intsrc_all(struct mpc_table *mpc,
continue;
if (nr_m_spare > 0) {
- apic_printk(APIC_VERBOSE, "*NEW* found\n");
+ apic_pr_verbose("*NEW* found\n");
nr_m_spare--;
memcpy(m_spare[nr_m_spare], &mp_irqs[i], sizeof(mp_irqs[i]));
m_spare[nr_m_spare] = NULL;
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 5358d43886ad..fec381533555 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -51,13 +51,12 @@ DEFINE_ASM_FUNC(pv_native_irq_enable, "sti", .noinstr.text);
DEFINE_ASM_FUNC(pv_native_read_cr2, "mov %cr2, %rax", .noinstr.text);
#endif
-DEFINE_STATIC_KEY_TRUE(virt_spin_lock_key);
+DEFINE_STATIC_KEY_FALSE(virt_spin_lock_key);
void __init native_pv_lock_init(void)
{
- if (IS_ENABLED(CONFIG_PARAVIRT_SPINLOCKS) &&
- !boot_cpu_has(X86_FEATURE_HYPERVISOR))
- static_branch_disable(&virt_spin_lock_key);
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ static_branch_enable(&virt_spin_lock_key);
}
static void native_tlb_remove_table(struct mmu_gather *tlb, void *table)
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index b8441147eb5e..f63f8fd00a91 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -835,6 +835,13 @@ void __noreturn stop_this_cpu(void *dummy)
*/
cpumask_clear_cpu(cpu, &cpus_stop_mask);
+#ifdef CONFIG_SMP
+ if (smp_ops.stop_this_cpu) {
+ smp_ops.stop_this_cpu();
+ unreachable();
+ }
+#endif
+
for (;;) {
/*
* Use native_halt() so that memory contents don't change
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 6d3d20e3e43a..226472332a70 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -798,6 +798,32 @@ static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr)
#define LAM_U57_BITS 6
+static void enable_lam_func(void *__mm)
+{
+ struct mm_struct *mm = __mm;
+ unsigned long lam;
+
+ if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm) {
+ lam = mm_lam_cr3_mask(mm);
+ write_cr3(__read_cr3() | lam);
+ cpu_tlbstate_update_lam(lam, mm_untag_mask(mm));
+ }
+}
+
+static void mm_enable_lam(struct mm_struct *mm)
+{
+ mm->context.lam_cr3_mask = X86_CR3_LAM_U57;
+ mm->context.untag_mask = ~GENMASK(62, 57);
+
+ /*
+ * Even though the process must still be single-threaded at this
+ * point, kernel threads may be using the mm. IPI those kernel
+ * threads if they exist.
+ */
+ on_each_cpu_mask(mm_cpumask(mm), enable_lam_func, mm, true);
+ set_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags);
+}
+
static int prctl_enable_tagged_addr(struct mm_struct *mm, unsigned long nr_bits)
{
if (!cpu_feature_enabled(X86_FEATURE_LAM))
@@ -814,25 +840,21 @@ static int prctl_enable_tagged_addr(struct mm_struct *mm, unsigned long nr_bits)
if (mmap_write_lock_killable(mm))
return -EINTR;
+ /*
+ * MM_CONTEXT_LOCK_LAM is set on clone. Prevent LAM from
+ * being enabled unless the process is single threaded:
+ */
if (test_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags)) {
mmap_write_unlock(mm);
return -EBUSY;
}
- if (!nr_bits) {
- mmap_write_unlock(mm);
- return -EINVAL;
- } else if (nr_bits <= LAM_U57_BITS) {
- mm->context.lam_cr3_mask = X86_CR3_LAM_U57;
- mm->context.untag_mask = ~GENMASK(62, 57);
- } else {
+ if (!nr_bits || nr_bits > LAM_U57_BITS) {
mmap_write_unlock(mm);
return -EINVAL;
}
- write_cr3(__read_cr3() | mm->context.lam_cr3_mask);
- set_tlbstate_lam_mode(mm);
- set_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags);
+ mm_enable_lam(mm);
mmap_write_unlock(mm);
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index f3130f762784..615922838c51 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -12,6 +12,7 @@
#include <linux/delay.h>
#include <linux/objtool.h>
#include <linux/pgtable.h>
+#include <linux/kexec.h>
#include <acpi/reboot.h>
#include <asm/io.h>
#include <asm/apic.h>
@@ -529,7 +530,7 @@ static inline void kb_wait(void)
static inline void nmi_shootdown_cpus_on_restart(void);
-#if IS_ENABLED(CONFIG_KVM_INTEL) || IS_ENABLED(CONFIG_KVM_AMD)
+#if IS_ENABLED(CONFIG_KVM_X86)
/* RCU-protected callback to disable virtualization prior to reboot. */
static cpu_emergency_virt_cb __rcu *cpu_emergency_virt_callback;
@@ -599,7 +600,7 @@ static void emergency_reboot_disable_virtualization(void)
}
#else
static void emergency_reboot_disable_virtualization(void) { }
-#endif /* CONFIG_KVM_INTEL || CONFIG_KVM_AMD */
+#endif /* CONFIG_KVM_X86 */
void __attribute__((weak)) mach_reboot_fixups(void)
{
@@ -716,6 +717,14 @@ static void native_machine_emergency_restart(void)
void native_machine_shutdown(void)
{
+ /*
+ * Call enc_kexec_begin() while all CPUs are still active and
+ * interrupts are enabled. This will allow all in-flight memory
+ * conversions to finish cleanly.
+ */
+ if (kexec_in_progress)
+ x86_platform.guest.enc_kexec_begin();
+
/* Stop the cpus and apics */
#ifdef CONFIG_X86_IO_APIC
/*
@@ -752,6 +761,9 @@ void native_machine_shutdown(void)
#ifdef CONFIG_X86_64
x86_platform.iommu_shutdown();
#endif
+
+ if (kexec_in_progress)
+ x86_platform.guest.enc_kexec_finish();
}
static void __machine_emergency_restart(int emergency)
@@ -868,6 +880,12 @@ static int crash_nmi_callback(unsigned int val, struct pt_regs *regs)
cpu_emergency_disable_virtualization();
atomic_dec(&waiting_for_crash_ipi);
+
+ if (smp_ops.stop_this_cpu) {
+ smp_ops.stop_this_cpu();
+ unreachable();
+ }
+
/* Assume hlt works */
halt();
for (;;)
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index 56cab1bb25f5..e9e88c342f75 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -5,6 +5,8 @@
*/
#include <linux/linkage.h>
+#include <linux/stringify.h>
+#include <asm/alternative.h>
#include <asm/page_types.h>
#include <asm/kexec.h>
#include <asm/processor-flags.h>
@@ -145,16 +147,15 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
* Set cr4 to a known state:
* - physical address extension enabled
* - 5-level paging, if it was enabled before
+ * - Machine check exception on TDX guest, if it was enabled before.
+ * Clearing MCE might not be allowed in TDX guests, depending on setup.
+ *
+ * Use R13 that contains the original CR4 value, read in relocate_kernel().
+ * PAE is always set in the original CR4.
*/
- movl $X86_CR4_PAE, %eax
- testq $X86_CR4_LA57, %r13
- jz 1f
- orl $X86_CR4_LA57, %eax
-1:
- movq %rax, %cr4
-
- jmp 1f
-1:
+ andl $(X86_CR4_PAE | X86_CR4_LA57), %r13d
+ ALTERNATIVE "", __stringify(orl $X86_CR4_MCE, %r13d), X86_FEATURE_TDX_GUEST
+ movq %r13, %cr4
/* Flush the TLB (needed?) */
movq %r9, %cr3
@@ -165,10 +166,11 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
* used by kexec. Flush the caches before copying the kernel.
*/
testq %r12, %r12
- jz 1f
+ jz .Lsme_off
wbinvd
-1:
+.Lsme_off:
+ /* Save the preserve_context to %r11 as swap_pages clobbers %rcx. */
movq %rcx, %r11
call swap_pages
@@ -187,7 +189,7 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
*/
testq %r11, %r11
- jnz 1f
+ jnz .Lrelocate
xorl %eax, %eax
xorl %ebx, %ebx
xorl %ecx, %ecx
@@ -208,7 +210,7 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
ret
int3
-1:
+.Lrelocate:
popq %rdx
leaq PAGE_SIZE(%r10), %rsp
ANNOTATE_RETPOLINE_SAFE
@@ -257,7 +259,7 @@ SYM_CODE_END(virtual_mapped)
/* Do the copies */
SYM_CODE_START_LOCAL_NOALIGN(swap_pages)
UNWIND_HINT_END_OF_STACK
- movq %rdi, %rcx /* Put the page_list in %rcx */
+ movq %rdi, %rcx /* Put the indirection_page in %rcx */
xorl %edi, %edi
xorl %esi, %esi
jmp 1f
@@ -288,18 +290,21 @@ SYM_CODE_START_LOCAL_NOALIGN(swap_pages)
movq %rcx, %rsi /* For ever source page do a copy */
andq $0xfffffffffffff000, %rsi
- movq %rdi, %rdx
- movq %rsi, %rax
+ movq %rdi, %rdx /* Save destination page to %rdx */
+ movq %rsi, %rax /* Save source page to %rax */
+ /* copy source page to swap page */
movq %r10, %rdi
movl $512, %ecx
rep ; movsq
+ /* copy destination page to source page */
movq %rax, %rdi
movq %rdx, %rsi
movl $512, %ecx
rep ; movsq
+ /* copy swap page to destination page */
movq %rdx, %rdi
movq %r10, %rsi
movl $512, %ecx
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index 2e7066980f3e..51a849a79c98 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -10,7 +10,6 @@
#include <asm/vsyscall.h>
#include <asm/x86_init.h>
#include <asm/time.h>
-#include <asm/intel-mid.h>
#include <asm/setup.h>
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index e125e059e2c4..f1fea506e20f 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -7,6 +7,7 @@
*/
#include <linux/acpi.h>
#include <linux/console.h>
+#include <linux/cpu.h>
#include <linux/crash_dump.h>
#include <linux/dma-map-ops.h>
#include <linux/efi.h>
@@ -163,7 +164,8 @@ unsigned long saved_video_mode;
static char __initdata command_line[COMMAND_LINE_SIZE];
#ifdef CONFIG_CMDLINE_BOOL
-static char __initdata builtin_cmdline[COMMAND_LINE_SIZE] = CONFIG_CMDLINE;
+char builtin_cmdline[COMMAND_LINE_SIZE] = CONFIG_CMDLINE;
+bool builtin_cmdline_added __ro_after_init;
#endif
#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
@@ -753,6 +755,23 @@ void __init setup_arch(char **cmdline_p)
boot_cpu_data.x86_phys_bits = MAX_PHYSMEM_BITS;
#endif
+#ifdef CONFIG_CMDLINE_BOOL
+#ifdef CONFIG_CMDLINE_OVERRIDE
+ strscpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
+#else
+ if (builtin_cmdline[0]) {
+ /* append boot loader cmdline to builtin */
+ strlcat(builtin_cmdline, " ", COMMAND_LINE_SIZE);
+ strlcat(builtin_cmdline, boot_command_line, COMMAND_LINE_SIZE);
+ strscpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
+ }
+#endif
+ builtin_cmdline_added = true;
+#endif
+
+ strscpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
+ *cmdline_p = command_line;
+
/*
* If we have OLPC OFW, we might end up relocating the fixmap due to
* reserve_top(), so do this before touching the ioremap area.
@@ -832,22 +851,6 @@ void __init setup_arch(char **cmdline_p)
bss_resource.start = __pa_symbol(__bss_start);
bss_resource.end = __pa_symbol(__bss_stop)-1;
-#ifdef CONFIG_CMDLINE_BOOL
-#ifdef CONFIG_CMDLINE_OVERRIDE
- strscpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
-#else
- if (builtin_cmdline[0]) {
- /* append boot loader cmdline to builtin */
- strlcat(builtin_cmdline, " ", COMMAND_LINE_SIZE);
- strlcat(builtin_cmdline, boot_command_line, COMMAND_LINE_SIZE);
- strscpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
- }
-#endif
-#endif
-
- strscpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
- *cmdline_p = command_line;
-
/*
* x86_configure_nx() is called before parse_early_param() to detect
* whether hardware doesn't support NX (so that the early EHCI debug
@@ -994,7 +997,6 @@ void __init setup_arch(char **cmdline_p)
mem_encrypt_setup_arch();
cc_random_init();
- efi_fake_memmap();
efi_find_mirror();
efi_esrt_init();
efi_mokvar_table_init();
@@ -1037,7 +1039,12 @@ void __init setup_arch(char **cmdline_p)
init_mem_mapping();
- idt_setup_early_pf();
+ /*
+ * init_mem_mapping() relies on the early IDT page fault handling.
+ * Now either enable FRED or install the real page fault handler
+ * for 64-bit in the IDT.
+ */
+ cpu_init_replace_early_idt();
/*
* Update mmu_cr4_features (and, indirectly, trampoline_cr4_features)
@@ -1107,8 +1114,6 @@ void __init setup_arch(char **cmdline_p)
*/
arch_reserve_crashkernel();
- memblock_find_dma_reserve();
-
if (!early_xdbc_setup_hardware())
early_xdbc_register_console();
@@ -1218,3 +1223,10 @@ static int __init register_kernel_offset_dumper(void)
return 0;
}
__initcall(register_kernel_offset_dumper);
+
+#ifdef CONFIG_HOTPLUG_CPU
+bool arch_cpu_is_hotpluggable(int cpu)
+{
+ return cpu > 0;
+}
+#endif /* CONFIG_HOTPLUG_CPU */
diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c
index 59e15dd8d0f8..059685612362 100644
--- a/arch/x86/kernel/shstk.c
+++ b/arch/x86/kernel/shstk.c
@@ -163,8 +163,8 @@ static int shstk_setup(void)
if (features_enabled(ARCH_SHSTK_SHSTK))
return 0;
- /* Also not supported for 32 bit and x32 */
- if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || in_32bit_syscall())
+ /* Also not supported for 32 bit */
+ if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || in_ia32_syscall())
return -EOPNOTSUPP;
size = adjust_shstk_size(0);
@@ -577,3 +577,19 @@ long shstk_prctl(struct task_struct *task, int option, unsigned long arg2)
return wrss_control(true);
return -EINVAL;
}
+
+int shstk_update_last_frame(unsigned long val)
+{
+ unsigned long ssp;
+
+ if (!features_enabled(ARCH_SHSTK_SHSTK))
+ return 0;
+
+ ssp = get_user_shstk_addr();
+ return write_user_shstk_64((u64 __user *)ssp, (u64)val);
+}
+
+bool shstk_is_enabled(void)
+{
+ return features_enabled(ARCH_SHSTK_SHSTK);
+}
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 31b6f5dddfc2..5f441039b572 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -61,6 +61,24 @@ static inline int is_x32_frame(struct ksignal *ksig)
}
/*
+ * Enable all pkeys temporarily, so as to ensure that both the current
+ * execution stack as well as the alternate signal stack are writeable.
+ * The application can use any of the available pkeys to protect the
+ * alternate signal stack, and we don't know which one it is, so enable
+ * all. The PKRU register will be reset to init_pkru later in the flow,
+ * in fpu__clear_user_states(), and it is the application's responsibility
+ * to enable the appropriate pkey as the first step in the signal handler
+ * so that the handler does not segfault.
+ */
+static inline u32 sig_prepare_pkru(void)
+{
+ u32 orig_pkru = read_pkru();
+
+ write_pkru(0);
+ return orig_pkru;
+}
+
+/*
* Set up a signal frame.
*/
@@ -84,6 +102,7 @@ get_sigframe(struct ksignal *ksig, struct pt_regs *regs, size_t frame_size,
unsigned long math_size = 0;
unsigned long sp = regs->sp;
unsigned long buf_fx = 0;
+ u32 pkru;
/* redzone */
if (!ia32_frame)
@@ -138,9 +157,17 @@ get_sigframe(struct ksignal *ksig, struct pt_regs *regs, size_t frame_size,
return (void __user *)-1L;
}
+ /* Update PKRU to enable access to the alternate signal stack. */
+ pkru = sig_prepare_pkru();
/* save i387 and extended state */
- if (!copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size))
+ if (!copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size, pkru)) {
+ /*
+ * Restore PKRU to the original, user-defined value; disable
+ * extra pkeys enabled for the alternate signal stack, if any.
+ */
+ write_pkru(pkru);
return (void __user *)-1L;
+ }
return (void __user *)sp;
}
diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c
index c12624bc82a3..ef654530bf5a 100644
--- a/arch/x86/kernel/signal_32.c
+++ b/arch/x86/kernel/signal_32.c
@@ -34,7 +34,7 @@
#include <asm/gsseg.h>
#ifdef CONFIG_IA32_EMULATION
-#include <asm/ia32_unistd.h>
+#include <asm/unistd_32_ia32.h>
static inline void reload_segments(struct sigcontext_32 *sc)
{
diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c
index 23d8aaf8d9fd..ee9453891901 100644
--- a/arch/x86/kernel/signal_64.c
+++ b/arch/x86/kernel/signal_64.c
@@ -260,13 +260,13 @@ SYSCALL_DEFINE0(rt_sigreturn)
set_current_blocked(&set);
- if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags))
+ if (restore_altstack(&frame->uc.uc_stack))
goto badframe;
- if (restore_signal_shadow_stack())
+ if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags))
goto badframe;
- if (restore_altstack(&frame->uc.uc_stack))
+ if (restore_signal_shadow_stack())
goto badframe;
return regs->ax;
@@ -315,6 +315,9 @@ int x32_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs)
uc_flags = frame_uc_flags(regs);
+ if (setup_signal_shadow_stack(ksig))
+ return -EFAULT;
+
if (!user_access_begin(frame, sizeof(*frame)))
return -EFAULT;
@@ -377,6 +380,9 @@ COMPAT_SYSCALL_DEFINE0(x32_rt_sigreturn)
if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags))
goto badframe;
+ if (restore_signal_shadow_stack())
+ goto badframe;
+
if (compat_restore_altstack(&frame->uc.uc_stack))
goto badframe;
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 76bb65045c64..766f092dab80 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -60,6 +60,7 @@
#include <linux/stackprotector.h>
#include <linux/cpuhotplug.h>
#include <linux/mc146818rtc.h>
+#include <linux/acpi.h>
#include <asm/acpi.h>
#include <asm/cacheinfo.h>
@@ -246,7 +247,7 @@ static void notrace start_secondary(void *unused)
__flush_tlb_all();
}
- cpu_init_exception_handling();
+ cpu_init_exception_handling(false);
/*
* Load the microcode before reaching the AP alive synchronization
@@ -438,9 +439,9 @@ static bool match_pkg(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
*/
static const struct x86_cpu_id intel_cod_cpu[] = {
- X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X, 0), /* COD */
- X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X, 0), /* COD */
- X86_MATCH_INTEL_FAM6_MODEL(ANY, 1), /* SNC */
+ X86_MATCH_VFM(INTEL_HASWELL_X, 0), /* COD */
+ X86_MATCH_VFM(INTEL_BROADWELL_X, 0), /* COD */
+ X86_MATCH_VFM(INTEL_ANY, 1), /* SNC */
{}
};
@@ -1033,20 +1034,22 @@ static __init void disable_smp(void)
void __init smp_prepare_cpus_common(void)
{
- unsigned int i;
+ unsigned int cpu, node;
/* Mark all except the boot CPU as hotpluggable */
- for_each_possible_cpu(i) {
- if (i)
- per_cpu(cpu_info.cpu_index, i) = nr_cpu_ids;
+ for_each_possible_cpu(cpu) {
+ if (cpu)
+ per_cpu(cpu_info.cpu_index, cpu) = nr_cpu_ids;
}
- for_each_possible_cpu(i) {
- zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);
- zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
- zalloc_cpumask_var(&per_cpu(cpu_die_map, i), GFP_KERNEL);
- zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL);
- zalloc_cpumask_var(&per_cpu(cpu_l2c_shared_map, i), GFP_KERNEL);
+ for_each_possible_cpu(cpu) {
+ node = cpu_to_node(cpu);
+
+ zalloc_cpumask_var_node(&per_cpu(cpu_sibling_map, cpu), GFP_KERNEL, node);
+ zalloc_cpumask_var_node(&per_cpu(cpu_core_map, cpu), GFP_KERNEL, node);
+ zalloc_cpumask_var_node(&per_cpu(cpu_die_map, cpu), GFP_KERNEL, node);
+ zalloc_cpumask_var_node(&per_cpu(cpu_llc_shared_map, cpu), GFP_KERNEL, node);
+ zalloc_cpumask_var_node(&per_cpu(cpu_l2c_shared_map, cpu), GFP_KERNEL, node);
}
set_cpu_sibling_map(0);
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index cb9fa1d5c66f..87f8c9a71c49 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -112,13 +112,21 @@ static void find_start_end(unsigned long addr, unsigned long flags,
*end = task_size_64bit(addr > DEFAULT_MAP_WINDOW);
}
+static inline unsigned long stack_guard_placement(vm_flags_t vm_flags)
+{
+ if (vm_flags & VM_SHADOW_STACK)
+ return PAGE_SIZE;
+
+ return 0;
+}
+
unsigned long
-arch_get_unmapped_area(struct file *filp, unsigned long addr,
- unsigned long len, unsigned long pgoff, unsigned long flags)
+arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len,
+ unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
- struct vm_unmapped_area_info info;
+ struct vm_unmapped_area_info info = {};
unsigned long begin, end;
if (flags & MAP_FIXED)
@@ -137,12 +145,11 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
return addr;
}
- info.flags = 0;
info.length = len;
info.low_limit = begin;
info.high_limit = end;
- info.align_mask = 0;
info.align_offset = pgoff << PAGE_SHIFT;
+ info.start_gap = stack_guard_placement(vm_flags);
if (filp) {
info.align_mask = get_align_mask();
info.align_offset += get_align_bits();
@@ -151,14 +158,14 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
}
unsigned long
-arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
- const unsigned long len, const unsigned long pgoff,
- const unsigned long flags)
+arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr0,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags, vm_flags_t vm_flags)
{
struct vm_area_struct *vma;
struct mm_struct *mm = current->mm;
unsigned long addr = addr0;
- struct vm_unmapped_area_info info;
+ struct vm_unmapped_area_info info = {};
/* requested length too big for entire address space */
if (len > TASK_SIZE)
@@ -192,6 +199,7 @@ get_unmapped_area:
info.low_limit = PAGE_SIZE;
info.high_limit = get_mmap_base(0);
+ info.start_gap = stack_guard_placement(vm_flags);
/*
* If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area
@@ -203,7 +211,6 @@ get_unmapped_area:
if (addr > DEFAULT_MAP_WINDOW && !in_32bit_syscall())
info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW;
- info.align_mask = 0;
info.align_offset = pgoff << PAGE_SHIFT;
if (filp) {
info.align_mask = get_align_mask();
@@ -221,5 +228,5 @@ bottomup:
* can happen with large stack limits and large mmap()
* allocations.
*/
- return arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
+ return arch_get_unmapped_area(filp, addr0, len, pgoff, flags, 0);
}
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index e42faa792c07..52e1f3f0b361 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -27,25 +27,7 @@
unsigned long profile_pc(struct pt_regs *regs)
{
- unsigned long pc = instruction_pointer(regs);
-
- if (!user_mode(regs) && in_lock_functions(pc)) {
-#ifdef CONFIG_FRAME_POINTER
- return *(unsigned long *)(regs->bp + sizeof(long));
-#else
- unsigned long *sp = (unsigned long *)regs->sp;
- /*
- * Return address is either directly at stack pointer
- * or above a saved flags. Eflags has bits 22-31 zero,
- * kernel addresses don't.
- */
- if (sp[0] >> 22)
- return sp[0];
- if (sp[1] >> 22)
- return sp[1];
-#endif
- }
- return pc;
+ return instruction_pointer(regs);
}
EXPORT_SYMBOL(profile_pc);
diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c
deleted file mode 100644
index d42c28b8bfd8..000000000000
--- a/arch/x86/kernel/topology.c
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Populate sysfs with topology information
- *
- * Written by: Matthew Dobson, IBM Corporation
- * Original Code: Paul Dorwin, IBM Corporation, Patrick Mochel, OSDL
- *
- * Copyright (C) 2002, IBM Corp.
- *
- * All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT. See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * Send feedback to <colpatch@us.ibm.com>
- */
-#include <linux/interrupt.h>
-#include <linux/nodemask.h>
-#include <linux/export.h>
-#include <linux/mmzone.h>
-#include <linux/init.h>
-#include <linux/smp.h>
-#include <linux/irq.h>
-#include <asm/io_apic.h>
-#include <asm/cpu.h>
-
-#ifdef CONFIG_HOTPLUG_CPU
-bool arch_cpu_is_hotpluggable(int cpu)
-{
- return cpu > 0;
-}
-#endif /* CONFIG_HOTPLUG_CPU */
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 4fa0b17e5043..d05392db5d0f 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -42,6 +42,7 @@
#include <linux/hardirq.h>
#include <linux/atomic.h>
#include <linux/iommu.h>
+#include <linux/ubsan.h>
#include <asm/stacktrace.h>
#include <asm/processor.h>
@@ -91,6 +92,47 @@ __always_inline int is_valid_bugaddr(unsigned long addr)
return *(unsigned short *)addr == INSN_UD2;
}
+/*
+ * Check for UD1 or UD2, accounting for Address Size Override Prefixes.
+ * If it's a UD1, get the ModRM byte to pass along to UBSan.
+ */
+__always_inline int decode_bug(unsigned long addr, u32 *imm)
+{
+ u8 v;
+
+ if (addr < TASK_SIZE_MAX)
+ return BUG_NONE;
+
+ v = *(u8 *)(addr++);
+ if (v == INSN_ASOP)
+ v = *(u8 *)(addr++);
+ if (v != OPCODE_ESCAPE)
+ return BUG_NONE;
+
+ v = *(u8 *)(addr++);
+ if (v == SECOND_BYTE_OPCODE_UD2)
+ return BUG_UD2;
+
+ if (!IS_ENABLED(CONFIG_UBSAN_TRAP) || v != SECOND_BYTE_OPCODE_UD1)
+ return BUG_NONE;
+
+ /* Retrieve the immediate (type value) for the UBSAN UD1 */
+ v = *(u8 *)(addr++);
+ if (X86_MODRM_RM(v) == 4)
+ addr++;
+
+ *imm = 0;
+ if (X86_MODRM_MOD(v) == 1)
+ *imm = *(u8 *)addr;
+ else if (X86_MODRM_MOD(v) == 2)
+ *imm = *(u32 *)addr;
+ else
+ WARN_ONCE(1, "Unexpected MODRM_MOD: %u\n", X86_MODRM_MOD(v));
+
+ return BUG_UD1;
+}
+
+
static nokprobe_inline int
do_trap_no_signal(struct task_struct *tsk, int trapnr, const char *str,
struct pt_regs *regs, long error_code)
@@ -216,6 +258,8 @@ static inline void handle_invalid_op(struct pt_regs *regs)
static noinstr bool handle_bug(struct pt_regs *regs)
{
bool handled = false;
+ int ud_type;
+ u32 imm;
/*
* Normally @regs are unpoisoned by irqentry_enter(), but handle_bug()
@@ -223,7 +267,8 @@ static noinstr bool handle_bug(struct pt_regs *regs)
* irqentry_enter().
*/
kmsan_unpoison_entry_regs(regs);
- if (!is_valid_bugaddr(regs->ip))
+ ud_type = decode_bug(regs->ip, &imm);
+ if (ud_type == BUG_NONE)
return handled;
/*
@@ -236,10 +281,14 @@ static noinstr bool handle_bug(struct pt_regs *regs)
*/
if (regs->flags & X86_EFLAGS_IF)
raw_local_irq_enable();
- if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN ||
- handle_cfi_failure(regs) == BUG_TRAP_TYPE_WARN) {
- regs->ip += LEN_UD2;
- handled = true;
+ if (ud_type == BUG_UD2) {
+ if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN ||
+ handle_cfi_failure(regs) == BUG_TRAP_TYPE_WARN) {
+ regs->ip += LEN_UD2;
+ handled = true;
+ }
+ } else if (IS_ENABLED(CONFIG_UBSAN_TRAP)) {
+ pr_crit("%s at %pS\n", report_ubsan_failure(regs, imm), (void *)regs->ip);
}
if (regs->flags & X86_EFLAGS_IF)
raw_local_irq_disable();
@@ -1402,34 +1451,8 @@ DEFINE_IDTENTRY_SW(iret_error)
}
#endif
-/* Do not enable FRED by default yet. */
-static bool enable_fred __ro_after_init = false;
-
-#ifdef CONFIG_X86_FRED
-static int __init fred_setup(char *str)
-{
- if (!str)
- return -EINVAL;
-
- if (!cpu_feature_enabled(X86_FEATURE_FRED))
- return 0;
-
- if (!strcmp(str, "on"))
- enable_fred = true;
- else if (!strcmp(str, "off"))
- enable_fred = false;
- else
- pr_warn("invalid FRED option: 'fred=%s'\n", str);
- return 0;
-}
-early_param("fred", fred_setup);
-#endif
-
void __init trap_init(void)
{
- if (cpu_feature_enabled(X86_FEATURE_FRED) && !enable_fred)
- setup_clear_cpu_cap(X86_FEATURE_FRED);
-
/* Init cpu_entry_area before IST entries are set up */
setup_cpu_entry_areas();
@@ -1437,7 +1460,7 @@ void __init trap_init(void)
sev_es_init_vc_handling();
/* Initialize TSS before setting up traps so ISTs work */
- cpu_init_exception_handling();
+ cpu_init_exception_handling(true);
/* Setup traps as cpu_init() might #GP */
if (!cpu_feature_enabled(X86_FEATURE_FRED))
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 5a69a49acc96..dfe6847fd99e 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -26,8 +26,9 @@
#include <asm/x86_init.h>
#include <asm/geode.h>
#include <asm/apic.h>
-#include <asm/intel-family.h>
+#include <asm/cpu_device_id.h>
#include <asm/i8259.h>
+#include <asm/topology.h>
#include <asm/uv/uv.h>
unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */
@@ -44,15 +45,15 @@ EXPORT_SYMBOL(tsc_khz);
static int __read_mostly tsc_unstable;
static unsigned int __initdata tsc_early_khz;
-static DEFINE_STATIC_KEY_FALSE(__use_tsc);
+static DEFINE_STATIC_KEY_FALSE_RO(__use_tsc);
int tsc_clocksource_reliable;
static int __read_mostly tsc_force_recalibrate;
-static u32 art_to_tsc_numerator;
-static u32 art_to_tsc_denominator;
-static u64 art_to_tsc_offset;
+static struct clocksource_base art_base_clk = {
+ .id = CSID_X86_ART,
+};
static bool have_art;
struct cyc2ns {
@@ -682,7 +683,7 @@ unsigned long native_calibrate_tsc(void)
* clock.
*/
if (crystal_khz == 0 &&
- boot_cpu_data.x86_model == INTEL_FAM6_ATOM_GOLDMONT_D)
+ boot_cpu_data.x86_vfm == INTEL_ATOM_GOLDMONT_D)
crystal_khz = 25000;
/*
@@ -713,7 +714,7 @@ unsigned long native_calibrate_tsc(void)
* For Atom SoCs TSC is the only reliable clocksource.
* Mark TSC reliable so no watchdog on it.
*/
- if (boot_cpu_data.x86_model == INTEL_FAM6_ATOM_GOLDMONT)
+ if (boot_cpu_data.x86_vfm == INTEL_ATOM_GOLDMONT)
setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
#ifdef CONFIG_X86_LOCAL_APIC
@@ -1074,7 +1075,7 @@ core_initcall(cpufreq_register_tsc_scaling);
*/
static void __init detect_art(void)
{
- unsigned int unused[2];
+ unsigned int unused;
if (boot_cpu_data.cpuid_level < ART_CPUID_LEAF)
return;
@@ -1089,13 +1090,14 @@ static void __init detect_art(void)
tsc_async_resets)
return;
- cpuid(ART_CPUID_LEAF, &art_to_tsc_denominator,
- &art_to_tsc_numerator, unused, unused+1);
+ cpuid(ART_CPUID_LEAF, &art_base_clk.denominator,
+ &art_base_clk.numerator, &art_base_clk.freq_khz, &unused);
- if (art_to_tsc_denominator < ART_MIN_DENOMINATOR)
+ art_base_clk.freq_khz /= KHZ;
+ if (art_base_clk.denominator < ART_MIN_DENOMINATOR)
return;
- rdmsrl(MSR_IA32_TSC_ADJUST, art_to_tsc_offset);
+ rdmsrl(MSR_IA32_TSC_ADJUST, art_base_clk.offset);
/* Make this sticky over multiple CPU init calls */
setup_force_cpu_cap(X86_FEATURE_ART);
@@ -1252,15 +1254,12 @@ static void __init check_system_tsc_reliable(void)
* - TSC which does not stop in C-States
* - the TSC_ADJUST register which allows to detect even minimal
* modifications
- * - not more than two sockets. As the number of sockets cannot be
- * evaluated at the early boot stage where this has to be
- * invoked, check the number of online memory nodes as a
- * fallback solution which is an reasonable estimate.
+ * - not more than four packages
*/
if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
boot_cpu_has(X86_FEATURE_NONSTOP_TSC) &&
boot_cpu_has(X86_FEATURE_TSC_ADJUST) &&
- nr_online_nodes <= 4)
+ topology_max_packages() <= 4)
tsc_disable_clocksource_watchdog();
}
@@ -1289,74 +1288,13 @@ int unsynchronized_tsc(void)
*/
if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
/* assume multi socket systems are not synchronized: */
- if (num_possible_cpus() > 1)
+ if (topology_max_packages() > 1)
return 1;
}
return 0;
}
-/*
- * Convert ART to TSC given numerator/denominator found in detect_art()
- */
-struct system_counterval_t convert_art_to_tsc(u64 art)
-{
- u64 tmp, res, rem;
-
- rem = do_div(art, art_to_tsc_denominator);
-
- res = art * art_to_tsc_numerator;
- tmp = rem * art_to_tsc_numerator;
-
- do_div(tmp, art_to_tsc_denominator);
- res += tmp + art_to_tsc_offset;
-
- return (struct system_counterval_t) {
- .cs_id = have_art ? CSID_X86_TSC : CSID_GENERIC,
- .cycles = res,
- };
-}
-EXPORT_SYMBOL(convert_art_to_tsc);
-
-/**
- * convert_art_ns_to_tsc() - Convert ART in nanoseconds to TSC.
- * @art_ns: ART (Always Running Timer) in unit of nanoseconds
- *
- * PTM requires all timestamps to be in units of nanoseconds. When user
- * software requests a cross-timestamp, this function converts system timestamp
- * to TSC.
- *
- * This is valid when CPU feature flag X86_FEATURE_TSC_KNOWN_FREQ is set
- * indicating the tsc_khz is derived from CPUID[15H]. Drivers should check
- * that this flag is set before conversion to TSC is attempted.
- *
- * Return:
- * struct system_counterval_t - system counter value with the ID of the
- * corresponding clocksource:
- * cycles: System counter value
- * cs_id: The clocksource ID for validating comparability
- */
-
-struct system_counterval_t convert_art_ns_to_tsc(u64 art_ns)
-{
- u64 tmp, res, rem;
-
- rem = do_div(art_ns, USEC_PER_SEC);
-
- res = art_ns * tsc_khz;
- tmp = rem * tsc_khz;
-
- do_div(tmp, USEC_PER_SEC);
- res += tmp;
-
- return (struct system_counterval_t) {
- .cs_id = have_art ? CSID_X86_TSC : CSID_GENERIC,
- .cycles = res,
- };
-}
-EXPORT_SYMBOL(convert_art_ns_to_tsc);
-
-
static void tsc_refine_calibration_work(struct work_struct *work);
static DECLARE_DELAYED_WORK(tsc_irqwork, tsc_refine_calibration_work);
/**
@@ -1458,8 +1396,10 @@ out:
if (tsc_unstable)
goto unreg;
- if (boot_cpu_has(X86_FEATURE_ART))
+ if (boot_cpu_has(X86_FEATURE_ART)) {
have_art = true;
+ clocksource_tsc.base = &art_base_clk;
+ }
clocksource_register_khz(&clocksource_tsc, tsc_khz);
unreg:
clocksource_unregister(&clocksource_tsc_early);
@@ -1484,8 +1424,10 @@ static int __init init_tsc_clocksource(void)
* the refined calibration and directly register it as a clocksource.
*/
if (boot_cpu_has(X86_FEATURE_TSC_KNOWN_FREQ)) {
- if (boot_cpu_has(X86_FEATURE_ART))
+ if (boot_cpu_has(X86_FEATURE_ART)) {
have_art = true;
+ clocksource_tsc.base = &art_base_clk;
+ }
clocksource_register_khz(&clocksource_tsc, tsc_khz);
clocksource_unregister(&clocksource_tsc_early);
@@ -1509,10 +1451,12 @@ static bool __init determine_cpu_tsc_frequencies(bool early)
if (early) {
cpu_khz = x86_platform.calibrate_cpu();
- if (tsc_early_khz)
+ if (tsc_early_khz) {
tsc_khz = tsc_early_khz;
- else
+ } else {
tsc_khz = x86_platform.calibrate_tsc();
+ clocksource_tsc.freq_khz = tsc_khz;
+ }
} else {
/* We should not be here with non-native cpu calibration */
WARN_ON(x86_platform.calibrate_cpu != native_calibrate_cpu);
diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c
index 6555a857a1e6..deeb02825670 100644
--- a/arch/x86/kernel/tsc_msr.c
+++ b/arch/x86/kernel/tsc_msr.c
@@ -147,13 +147,13 @@ static const struct freq_desc freq_desc_lgm = {
};
static const struct x86_cpu_id tsc_msr_cpu_ids[] = {
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_SALTWELL_MID, &freq_desc_pnw),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_SALTWELL_TABLET,&freq_desc_clv),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_SILVERMONT, &freq_desc_byt),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_SILVERMONT_MID, &freq_desc_tng),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_AIRMONT, &freq_desc_cht),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_AIRMONT_MID, &freq_desc_ann),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_AIRMONT_NP, &freq_desc_lgm),
+ X86_MATCH_VFM(INTEL_ATOM_SALTWELL_MID, &freq_desc_pnw),
+ X86_MATCH_VFM(INTEL_ATOM_SALTWELL_TABLET, &freq_desc_clv),
+ X86_MATCH_VFM(INTEL_ATOM_SILVERMONT, &freq_desc_byt),
+ X86_MATCH_VFM(INTEL_ATOM_SILVERMONT_MID, &freq_desc_tng),
+ X86_MATCH_VFM(INTEL_ATOM_AIRMONT, &freq_desc_cht),
+ X86_MATCH_VFM(INTEL_ATOM_AIRMONT_MID, &freq_desc_ann),
+ X86_MATCH_VFM(INTEL_ATOM_AIRMONT_NP, &freq_desc_lgm),
{}
};
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index 1123ef3ccf90..4334033658ed 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -193,11 +193,9 @@ bool tsc_store_and_check_tsc_adjust(bool bootcpu)
cur->warned = false;
/*
- * If a non-zero TSC value for socket 0 may be valid then the default
- * adjusted value cannot assumed to be zero either.
+ * The default adjust value cannot be assumed to be zero on any socket.
*/
- if (tsc_async_resets)
- cur->adjusted = bootval;
+ cur->adjusted = bootval;
/*
* Check whether this CPU is the first in a package to come up. In
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 6c07f6daaa22..5a952c5ea66b 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -12,6 +12,7 @@
#include <linux/ptrace.h>
#include <linux/uprobes.h>
#include <linux/uaccess.h>
+#include <linux/syscalls.h>
#include <linux/kdebug.h>
#include <asm/processor.h>
@@ -308,6 +309,122 @@ static int uprobe_init_insn(struct arch_uprobe *auprobe, struct insn *insn, bool
}
#ifdef CONFIG_X86_64
+
+asm (
+ ".pushsection .rodata\n"
+ ".global uretprobe_trampoline_entry\n"
+ "uretprobe_trampoline_entry:\n"
+ "pushq %rax\n"
+ "pushq %rcx\n"
+ "pushq %r11\n"
+ "movq $" __stringify(__NR_uretprobe) ", %rax\n"
+ "syscall\n"
+ ".global uretprobe_syscall_check\n"
+ "uretprobe_syscall_check:\n"
+ "popq %r11\n"
+ "popq %rcx\n"
+
+ /* The uretprobe syscall replaces stored %rax value with final
+ * return address, so we don't restore %rax in here and just
+ * call ret.
+ */
+ "retq\n"
+ ".global uretprobe_trampoline_end\n"
+ "uretprobe_trampoline_end:\n"
+ ".popsection\n"
+);
+
+extern u8 uretprobe_trampoline_entry[];
+extern u8 uretprobe_trampoline_end[];
+extern u8 uretprobe_syscall_check[];
+
+void *arch_uprobe_trampoline(unsigned long *psize)
+{
+ static uprobe_opcode_t insn = UPROBE_SWBP_INSN;
+ struct pt_regs *regs = task_pt_regs(current);
+
+ /*
+ * At the moment the uretprobe syscall trampoline is supported
+ * only for native 64-bit process, the compat process still uses
+ * standard breakpoint.
+ */
+ if (user_64bit_mode(regs)) {
+ *psize = uretprobe_trampoline_end - uretprobe_trampoline_entry;
+ return uretprobe_trampoline_entry;
+ }
+
+ *psize = UPROBE_SWBP_INSN_SIZE;
+ return &insn;
+}
+
+static unsigned long trampoline_check_ip(void)
+{
+ unsigned long tramp = uprobe_get_trampoline_vaddr();
+
+ return tramp + (uretprobe_syscall_check - uretprobe_trampoline_entry);
+}
+
+SYSCALL_DEFINE0(uretprobe)
+{
+ struct pt_regs *regs = task_pt_regs(current);
+ unsigned long err, ip, sp, r11_cx_ax[3];
+
+ if (regs->ip != trampoline_check_ip())
+ goto sigill;
+
+ err = copy_from_user(r11_cx_ax, (void __user *)regs->sp, sizeof(r11_cx_ax));
+ if (err)
+ goto sigill;
+
+ /* expose the "right" values of r11/cx/ax/sp to uprobe_consumer/s */
+ regs->r11 = r11_cx_ax[0];
+ regs->cx = r11_cx_ax[1];
+ regs->ax = r11_cx_ax[2];
+ regs->sp += sizeof(r11_cx_ax);
+ regs->orig_ax = -1;
+
+ ip = regs->ip;
+ sp = regs->sp;
+
+ uprobe_handle_trampoline(regs);
+
+ /*
+ * Some of the uprobe consumers has changed sp, we can do nothing,
+ * just return via iret.
+ * .. or shadow stack is enabled, in which case we need to skip
+ * return through the user space stack address.
+ */
+ if (regs->sp != sp || shstk_is_enabled())
+ return regs->ax;
+ regs->sp -= sizeof(r11_cx_ax);
+
+ /* for the case uprobe_consumer has changed r11/cx */
+ r11_cx_ax[0] = regs->r11;
+ r11_cx_ax[1] = regs->cx;
+
+ /*
+ * ax register is passed through as return value, so we can use
+ * its space on stack for ip value and jump to it through the
+ * trampoline's ret instruction
+ */
+ r11_cx_ax[2] = regs->ip;
+ regs->ip = ip;
+
+ err = copy_to_user((void __user *)regs->sp, r11_cx_ax, sizeof(r11_cx_ax));
+ if (err)
+ goto sigill;
+
+ /* ensure sysret, see do_syscall_64() */
+ regs->r11 = regs->flags;
+ regs->cx = regs->ip;
+
+ return regs->ax;
+
+sigill:
+ force_sig(SIGILL);
+ return -1;
+}
+
/*
* If arch_uprobe->insn doesn't use rip-relative addressing, return
* immediately. Otherwise, rewrite the instruction so that it accesses
@@ -1076,8 +1193,13 @@ arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs
return orig_ret_vaddr;
nleft = copy_to_user((void __user *)regs->sp, &trampoline_vaddr, rasize);
- if (likely(!nleft))
+ if (likely(!nleft)) {
+ if (shstk_update_last_frame(trampoline_vaddr)) {
+ force_sig(SIGSEGV);
+ return -1;
+ }
return orig_ret_vaddr;
+ }
if (nleft != rasize) {
pr_err("return address clobbered: pid=%d, %%sp=%#lx, %%ip=%#lx\n",
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 56451fd2099e..6726be89b7a6 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -15,11 +15,7 @@
* put it inside the section definition.
*/
-#ifdef CONFIG_X86_32
-#define LOAD_OFFSET __PAGE_OFFSET
-#else
#define LOAD_OFFSET __START_KERNEL_map
-#endif
#define RUNTIME_DISCARD_EXIT
#define EMITS_PT_NOTE
@@ -114,11 +110,10 @@ PHDRS {
SECTIONS
{
+ . = __START_KERNEL;
#ifdef CONFIG_X86_32
- . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR;
phys_startup_32 = ABSOLUTE(startup_32 - LOAD_OFFSET);
#else
- . = __START_KERNEL;
phys_startup_64 = ABSOLUTE(startup_64 - LOAD_OFFSET);
#endif
@@ -172,6 +167,9 @@ SECTIONS
/* init_task */
INIT_TASK_DATA(THREAD_SIZE)
+ /* equivalent to task_pt_regs(&init_task) */
+ __top_init_kernel_stack = __end_init_stack - TOP_OF_KERNEL_STACK_PADDING - PTREGS_SIZE;
+
#ifdef CONFIG_X86_32
/* 32 bit has nosave before _edata */
NOSAVE_DATA
@@ -359,6 +357,8 @@ SECTIONS
PERCPU_SECTION(INTERNODE_CACHE_BYTES)
#endif
+ RUNTIME_CONST_VARIABLES
+
. = ALIGN(PAGE_SIZE);
/* freed after init ends here */
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index d5dc5a92635a..0a2bbd674a6d 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -8,6 +8,7 @@
#include <linux/ioport.h>
#include <linux/export.h>
#include <linux/pci.h>
+#include <linux/acpi.h>
#include <asm/acpi.h>
#include <asm/bios_ebda.h>
@@ -134,10 +135,12 @@ struct x86_cpuinit_ops x86_cpuinit = {
static void default_nmi_init(void) { };
-static bool enc_status_change_prepare_noop(unsigned long vaddr, int npages, bool enc) { return true; }
-static bool enc_status_change_finish_noop(unsigned long vaddr, int npages, bool enc) { return true; }
+static int enc_status_change_prepare_noop(unsigned long vaddr, int npages, bool enc) { return 0; }
+static int enc_status_change_finish_noop(unsigned long vaddr, int npages, bool enc) { return 0; }
static bool enc_tlb_flush_required_noop(bool enc) { return false; }
static bool enc_cache_flush_required_noop(void) { return false; }
+static void enc_kexec_begin_noop(void) {}
+static void enc_kexec_finish_noop(void) {}
static bool is_private_mmio_noop(u64 addr) {return false; }
struct x86_platform_ops x86_platform __ro_after_init = {
@@ -161,6 +164,8 @@ struct x86_platform_ops x86_platform __ro_after_init = {
.enc_status_change_finish = enc_status_change_finish_noop,
.enc_tlb_flush_required = enc_tlb_flush_required_noop,
.enc_cache_flush_required = enc_cache_flush_required_noop,
+ .enc_kexec_begin = enc_kexec_begin_noop,
+ .enc_kexec_finish = enc_kexec_finish_noop,
},
};
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 0ebdd088f28b..f09f13c01c6b 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -17,9 +17,8 @@ menuconfig VIRTUALIZATION
if VIRTUALIZATION
-config KVM
- tristate "Kernel-based Virtual Machine (KVM) support"
- depends on HIGH_RES_TIMERS
+config KVM_X86
+ def_tristate KVM if KVM_INTEL || KVM_AMD
depends on X86_LOCAL_APIC
select KVM_COMMON
select KVM_GENERIC_MMU_NOTIFIER
@@ -44,6 +43,12 @@ config KVM
select KVM_VFIO
select HAVE_KVM_PM_NOTIFIER if PM
select KVM_GENERIC_HARDWARE_ENABLING
+ select KVM_GENERIC_PRE_FAULT_MEMORY
+ select KVM_GENERIC_PRIVATE_MEM if KVM_SW_PROTECTED_VM
+ select KVM_WERROR if WERROR
+
+config KVM
+ tristate "Kernel-based Virtual Machine (KVM) support"
help
Support hosting fully virtualized guest machines using hardware
virtualization extensions. You will need a fairly recent
@@ -66,7 +71,7 @@ config KVM_WERROR
# FRAME_WARN, i.e. KVM_WERROR=y with KASAN=y requires special tuning.
# Building KVM with -Werror and KASAN is still doable via enabling
# the kernel-wide WERROR=y.
- depends on KVM && EXPERT && !KASAN
+ depends on KVM && ((EXPERT && !KASAN) || WERROR)
help
Add -Werror to the build flags for KVM.
@@ -76,7 +81,6 @@ config KVM_SW_PROTECTED_VM
bool "Enable support for KVM software-protected VMs"
depends on EXPERT
depends on KVM && X86_64
- select KVM_GENERIC_PRIVATE_MEM
help
Enable support for KVM software-protected VMs. Currently, software-
protected VMs are purely a development and testing vehicle for
@@ -95,6 +99,21 @@ config KVM_INTEL
To compile this as a module, choose M here: the module
will be called kvm-intel.
+config KVM_INTEL_PROVE_VE
+ bool "Check that guests do not receive #VE exceptions"
+ depends on KVM_INTEL && EXPERT
+ help
+ Checks that KVM's page table management code will not incorrectly
+ let guests receive a virtualization exception. Virtualization
+ exceptions will be trapped by the hypervisor rather than injected
+ in the guest.
+
+ Note: some CPUs appear to generate spurious EPT Violations #VEs
+ that trigger KVM's WARN, in particular with eptad=0 and/or nested
+ virtualization.
+
+ If unsure, say N.
+
config X86_SGX_KVM
bool "Software Guard eXtensions (SGX) Virtualization"
depends on X86_SGX && KVM_INTEL
@@ -123,9 +142,14 @@ config KVM_AMD_SEV
depends on KVM_AMD && X86_64
depends on CRYPTO_DEV_SP_PSP && !(KVM_AMD=y && CRYPTO_DEV_CCP_DD=m)
select ARCH_HAS_CC_PLATFORM
+ select KVM_GENERIC_PRIVATE_MEM
+ select HAVE_KVM_ARCH_GMEM_PREPARE
+ select HAVE_KVM_ARCH_GMEM_INVALIDATE
help
- Provides support for launching Encrypted VMs (SEV) and Encrypted VMs
- with Encrypted State (SEV-ES) on AMD processors.
+ Provides support for launching encrypted VMs which use Secure
+ Encrypted Virtualization (SEV), Secure Encrypted Virtualization with
+ Encrypted State (SEV-ES), and Secure Encrypted Virtualization with
+ Secure Nested Paging (SEV-SNP) technologies on AMD processors.
config KVM_SMM
bool "System Management Mode emulation"
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index addc44fc7187..f9dddb8cb466 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -16,14 +16,15 @@ kvm-$(CONFIG_KVM_XEN) += xen.o
kvm-$(CONFIG_KVM_SMM) += smm.o
kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \
- vmx/nested.o vmx/posted_intr.o
+ vmx/nested.o vmx/posted_intr.o vmx/main.o
kvm-intel-$(CONFIG_X86_SGX_KVM) += vmx/sgx.o
kvm-intel-$(CONFIG_KVM_HYPERV) += vmx/hyperv.o vmx/hyperv_evmcs.o
-kvm-amd-y += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o \
- svm/sev.o
-kvm-amd-$(CONFIG_KVM_HYPERV) += svm/hyperv.o
+kvm-amd-y += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o
+
+kvm-amd-$(CONFIG_KVM_AMD_SEV) += svm/sev.o
+kvm-amd-$(CONFIG_KVM_HYPERV) += svm/hyperv.o
ifdef CONFIG_HYPERV
kvm-y += kvm_onhyperv.o
@@ -31,7 +32,7 @@ kvm-intel-y += vmx/vmx_onhyperv.o vmx/hyperv_evmcs.o
kvm-amd-y += svm/svm_onhyperv.o
endif
-obj-$(CONFIG_KVM) += kvm.o
+obj-$(CONFIG_KVM_X86) += kvm.o
obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
obj-$(CONFIG_KVM_AMD) += kvm-amd.o
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 77352a4abd87..41786b834b16 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -335,6 +335,18 @@ static bool kvm_cpuid_has_hyperv(struct kvm_cpuid_entry2 *entries, int nent)
#endif
}
+static bool guest_cpuid_is_amd_or_hygon(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *entry;
+
+ entry = kvm_find_cpuid_entry(vcpu, 0);
+ if (!entry)
+ return false;
+
+ return is_guest_vendor_amd(entry->ebx, entry->ecx, entry->edx) ||
+ is_guest_vendor_hygon(entry->ebx, entry->ecx, entry->edx);
+}
+
static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
@@ -388,7 +400,7 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
vcpu->arch.cpuid_nent));
/* Invoke the vendor callback only after the above state is updated. */
- static_call(kvm_x86_vcpu_after_set_cpuid)(vcpu);
+ kvm_x86_call(vcpu_after_set_cpuid)(vcpu);
/*
* Except for the MMU, which needs to do its thing any vendor specific
@@ -693,7 +705,7 @@ void kvm_set_cpu_caps(void)
kvm_cpu_cap_init_kvm_defined(CPUID_7_1_EDX,
F(AVX_VNNI_INT8) | F(AVX_NE_CONVERT) | F(PREFETCHITI) |
- F(AMX_COMPLEX)
+ F(AMX_COMPLEX) | F(AVX10)
);
kvm_cpu_cap_init_kvm_defined(CPUID_7_2_EDX,
@@ -709,6 +721,10 @@ void kvm_set_cpu_caps(void)
SF(SGX1) | SF(SGX2) | SF(SGX_EDECCSSA)
);
+ kvm_cpu_cap_init_kvm_defined(CPUID_24_0_EBX,
+ F(AVX10_128) | F(AVX10_256) | F(AVX10_512)
+ );
+
kvm_cpu_cap_mask(CPUID_8000_0001_ECX,
F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
@@ -772,7 +788,7 @@ void kvm_set_cpu_caps(void)
kvm_cpu_cap_mask(CPUID_8000_000A_EDX, 0);
kvm_cpu_cap_mask(CPUID_8000_001F_EAX,
- 0 /* SME */ | F(SEV) | 0 /* VM_PAGE_FLUSH */ | F(SEV_ES) |
+ 0 /* SME */ | 0 /* SEV */ | 0 /* VM_PAGE_FLUSH */ | 0 /* SEV_ES */ |
F(SME_COHERENT));
kvm_cpu_cap_mask(CPUID_8000_0021_EAX,
@@ -937,7 +953,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
switch (function) {
case 0:
/* Limited to the highest leaf implemented in KVM. */
- entry->eax = min(entry->eax, 0x1fU);
+ entry->eax = min(entry->eax, 0x24U);
break;
case 1:
cpuid_entry_override(entry, CPUID_1_EDX);
@@ -1162,6 +1178,28 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
break;
}
break;
+ case 0x24: {
+ u8 avx10_version;
+
+ if (!kvm_cpu_cap_has(X86_FEATURE_AVX10)) {
+ entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+ break;
+ }
+
+ /*
+ * The AVX10 version is encoded in EBX[7:0]. Note, the version
+ * is guaranteed to be >=1 if AVX10 is supported. Note #2, the
+ * version needs to be captured before overriding EBX features!
+ */
+ avx10_version = min_t(u8, entry->ebx & 0xff, 1);
+ cpuid_entry_override(entry, CPUID_24_0_EBX);
+ entry->ebx |= avx10_version;
+
+ entry->eax = 0;
+ entry->ecx = 0;
+ entry->edx = 0;
+ break;
+ }
case KVM_CPUID_SIGNATURE: {
const u32 *sigptr = (const u32 *)KVM_SIGNATURE;
entry->eax = KVM_CPUID_FEATURES;
@@ -1232,9 +1270,22 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
entry->eax = entry->ebx = entry->ecx = 0;
break;
case 0x80000008: {
- unsigned g_phys_as = (entry->eax >> 16) & 0xff;
- unsigned virt_as = max((entry->eax >> 8) & 0xff, 48U);
- unsigned phys_as = entry->eax & 0xff;
+ /*
+ * GuestPhysAddrSize (EAX[23:16]) is intended for software
+ * use.
+ *
+ * KVM's ABI is to report the effective MAXPHYADDR for the
+ * guest in PhysAddrSize (phys_as), and the maximum
+ * *addressable* GPA in GuestPhysAddrSize (g_phys_as).
+ *
+ * GuestPhysAddrSize is valid if and only if TDP is enabled,
+ * in which case the max GPA that can be addressed by KVM may
+ * be less than the max GPA that can be legally generated by
+ * the guest, e.g. if MAXPHYADDR>48 but the CPU doesn't
+ * support 5-level TDP.
+ */
+ unsigned int virt_as = max((entry->eax >> 8) & 0xff, 48U);
+ unsigned int phys_as, g_phys_as;
/*
* If TDP (NPT) is disabled use the adjusted host MAXPHYADDR as
@@ -1242,16 +1293,24 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
* reductions in MAXPHYADDR for memory encryption affect shadow
* paging, too.
*
- * If TDP is enabled but an explicit guest MAXPHYADDR is not
- * provided, use the raw bare metal MAXPHYADDR as reductions to
- * the HPAs do not affect GPAs.
+ * If TDP is enabled, use the raw bare metal MAXPHYADDR as
+ * reductions to the HPAs do not affect GPAs. The max
+ * addressable GPA is the same as the max effective GPA, except
+ * that it's capped at 48 bits if 5-level TDP isn't supported
+ * (hardware processes bits 51:48 only when walking the fifth
+ * level page table).
*/
- if (!tdp_enabled)
- g_phys_as = boot_cpu_data.x86_phys_bits;
- else if (!g_phys_as)
+ if (!tdp_enabled) {
+ phys_as = boot_cpu_data.x86_phys_bits;
+ g_phys_as = 0;
+ } else {
+ phys_as = entry->eax & 0xff;
g_phys_as = phys_as;
+ if (kvm_mmu_get_max_tdp_level() < 5)
+ g_phys_as = min(g_phys_as, 48);
+ }
- entry->eax = g_phys_as | (virt_as << 8);
+ entry->eax = phys_as | (virt_as << 8) | (g_phys_as << 16);
entry->ecx &= ~(GENMASK(31, 16) | GENMASK(11, 8));
entry->edx = 0;
cpuid_entry_override(entry, CPUID_8000_0008_EBX);
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index 23dbb9eb277c..41697cca354e 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -102,24 +102,6 @@ static __always_inline void guest_cpuid_clear(struct kvm_vcpu *vcpu,
*reg &= ~__feature_bit(x86_feature);
}
-static inline bool guest_cpuid_is_amd_or_hygon(struct kvm_vcpu *vcpu)
-{
- struct kvm_cpuid_entry2 *best;
-
- best = kvm_find_cpuid_entry(vcpu, 0);
- return best &&
- (is_guest_vendor_amd(best->ebx, best->ecx, best->edx) ||
- is_guest_vendor_hygon(best->ebx, best->ecx, best->edx));
-}
-
-static inline bool guest_cpuid_is_intel(struct kvm_vcpu *vcpu)
-{
- struct kvm_cpuid_entry2 *best;
-
- best = kvm_find_cpuid_entry(vcpu, 0);
- return best && is_guest_vendor_intel(best->ebx, best->ecx, best->edx);
-}
-
static inline bool guest_cpuid_is_amd_compatible(struct kvm_vcpu *vcpu)
{
return vcpu->arch.is_amd_compatible;
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 5d4c86133453..e72aed25d721 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1069,7 +1069,7 @@ static __always_inline u8 test_cc(unsigned int condition, unsigned long flags)
flags = (flags & EFLAGS_MASK) | X86_EFLAGS_IF;
asm("push %[flags]; popf; " CALL_NOSPEC
- : "=a"(rc) : [thunk_target]"r"(fop), [flags]"r"(flags));
+ : "=a"(rc), ASM_CALL_CONSTRAINT : [thunk_target]"r"(fop), [flags]"r"(flags));
return rc;
}
@@ -2354,50 +2354,6 @@ setup_syscalls_segments(struct desc_struct *cs, struct desc_struct *ss)
ss->avl = 0;
}
-static bool vendor_intel(struct x86_emulate_ctxt *ctxt)
-{
- u32 eax, ebx, ecx, edx;
-
- eax = ecx = 0;
- ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, true);
- return is_guest_vendor_intel(ebx, ecx, edx);
-}
-
-static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt)
-{
- const struct x86_emulate_ops *ops = ctxt->ops;
- u32 eax, ebx, ecx, edx;
-
- /*
- * syscall should always be enabled in longmode - so only become
- * vendor specific (cpuid) if other modes are active...
- */
- if (ctxt->mode == X86EMUL_MODE_PROT64)
- return true;
-
- eax = 0x00000000;
- ecx = 0x00000000;
- ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, true);
- /*
- * remark: Intel CPUs only support "syscall" in 64bit longmode. Also a
- * 64bit guest with a 32bit compat-app running will #UD !! While this
- * behaviour can be fixed (by emulating) into AMD response - CPUs of
- * AMD can't behave like Intel.
- */
- if (is_guest_vendor_intel(ebx, ecx, edx))
- return false;
-
- if (is_guest_vendor_amd(ebx, ecx, edx) ||
- is_guest_vendor_hygon(ebx, ecx, edx))
- return true;
-
- /*
- * default: (not Intel, not AMD, not Hygon), apply Intel's
- * stricter rules...
- */
- return false;
-}
-
static int em_syscall(struct x86_emulate_ctxt *ctxt)
{
const struct x86_emulate_ops *ops = ctxt->ops;
@@ -2411,7 +2367,15 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt)
ctxt->mode == X86EMUL_MODE_VM86)
return emulate_ud(ctxt);
- if (!(em_syscall_is_enabled(ctxt)))
+ /*
+ * Intel compatible CPUs only support SYSCALL in 64-bit mode, whereas
+ * AMD allows SYSCALL in any flavor of protected mode. Note, it's
+ * infeasible to emulate Intel behavior when running on AMD hardware,
+ * as SYSCALL won't fault in the "wrong" mode, i.e. there is no #UD
+ * for KVM to trap-and-emulate, unlike emulating AMD on Intel.
+ */
+ if (ctxt->mode != X86EMUL_MODE_PROT64 &&
+ ctxt->ops->guest_cpuid_is_intel_compatible(ctxt))
return emulate_ud(ctxt);
ops->get_msr(ctxt, MSR_EFER, &efer);
@@ -2471,11 +2435,11 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt)
return emulate_gp(ctxt, 0);
/*
- * Not recognized on AMD in compat mode (but is recognized in legacy
- * mode).
+ * Intel's architecture allows SYSENTER in compatibility mode, but AMD
+ * does not. Note, AMD does allow SYSENTER in legacy protected mode.
*/
- if ((ctxt->mode != X86EMUL_MODE_PROT64) && (efer & EFER_LMA)
- && !vendor_intel(ctxt))
+ if ((ctxt->mode != X86EMUL_MODE_PROT64) && (efer & EFER_LMA) &&
+ !ctxt->ops->guest_cpuid_is_intel_compatible(ctxt))
return emulate_ud(ctxt);
/* sysenter/sysexit have not been tested in 64bit mode. */
@@ -2647,7 +2611,14 @@ static void string_registers_quirk(struct x86_emulate_ctxt *ctxt)
* manner when ECX is zero due to REP-string optimizations.
*/
#ifdef CONFIG_X86_64
- if (ctxt->ad_bytes != 4 || !vendor_intel(ctxt))
+ u32 eax, ebx, ecx, edx;
+
+ if (ctxt->ad_bytes != 4)
+ return;
+
+ eax = ecx = 0;
+ ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, true);
+ if (!is_guest_vendor_intel(ebx, ecx, edx))
return;
*reg_write(ctxt, VCPU_REGS_RCX) = 0;
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 8a47f8541eab..4f0a94346d00 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -1417,7 +1417,7 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
}
/* vmcall/vmmcall */
- static_call(kvm_x86_patch_hypercall)(vcpu, instructions + i);
+ kvm_x86_call(patch_hypercall)(vcpu, instructions + i);
i += 3;
/* ret */
@@ -1737,7 +1737,8 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata,
data = (u64)vcpu->arch.virtual_tsc_khz * 1000;
break;
case HV_X64_MSR_APIC_FREQUENCY:
- data = APIC_BUS_FREQUENCY;
+ data = div64_u64(1000000000ULL,
+ vcpu->kvm->arch.apic_bus_cycle_ns);
break;
default:
kvm_pr_unimpl_rdmsr(vcpu, msr);
@@ -1985,7 +1986,7 @@ int kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu)
*/
gva = entries[i] & PAGE_MASK;
for (j = 0; j < (entries[i] & ~PAGE_MASK) + 1; j++)
- static_call(kvm_x86_flush_tlb_gva)(vcpu, gva + j * PAGE_SIZE);
+ kvm_x86_call(flush_tlb_gva)(vcpu, gva + j * PAGE_SIZE);
++vcpu->stat.tlb_flush;
}
@@ -2526,7 +2527,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
* hypercall generates UD from non zero cpl and real mode
* per HYPER-V spec
*/
- if (static_call(kvm_x86_get_cpl)(vcpu) != 0 || !is_protmode(vcpu)) {
+ if (kvm_x86_call(get_cpl)(vcpu) != 0 || !is_protmode(vcpu)) {
kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
}
diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h
index 923e64903da9..913bfc96959c 100644
--- a/arch/x86/kvm/hyperv.h
+++ b/arch/x86/kvm/hyperv.h
@@ -286,7 +286,6 @@ static inline int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
return HV_STATUS_ACCESS_DENIED;
}
static inline void kvm_hv_vcpu_purge_flush_tlb(struct kvm_vcpu *vcpu) {}
-static inline void kvm_hv_free_pa_page(struct kvm *kvm) {}
static inline bool kvm_hv_synic_has_vector(struct kvm_vcpu *vcpu, int vector)
{
return false;
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index ad9ca8a60144..63f66c51975a 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -108,7 +108,7 @@ EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt);
* Read pending interrupt(from non-APIC source)
* vector and intack.
*/
-static int kvm_cpu_get_extint(struct kvm_vcpu *v)
+int kvm_cpu_get_extint(struct kvm_vcpu *v)
{
if (!kvm_cpu_has_extint(v)) {
WARN_ON(!lapic_in_kernel(v));
@@ -131,6 +131,7 @@ static int kvm_cpu_get_extint(struct kvm_vcpu *v)
} else
return kvm_pic_read_irq(v->kvm); /* PIC */
}
+EXPORT_SYMBOL_GPL(kvm_cpu_get_extint);
/*
* Read pending interrupt vector and intack.
@@ -141,9 +142,12 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
if (vector != -1)
return vector; /* PIC */
- return kvm_get_apic_interrupt(v); /* APIC */
+ vector = kvm_apic_has_interrupt(v); /* APIC */
+ if (vector != -1)
+ kvm_apic_ack_interrupt(v, vector);
+
+ return vector;
}
-EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt);
void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
{
@@ -157,7 +161,7 @@ void __kvm_migrate_timers(struct kvm_vcpu *vcpu)
{
__kvm_migrate_apic_timer(vcpu);
__kvm_migrate_pit_timer(vcpu);
- static_call_cond(kvm_x86_migrate_timers)(vcpu);
+ kvm_x86_call(migrate_timers)(vcpu);
}
bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args)
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index c2d7cfe82d00..76d46b2f41dd 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -106,7 +106,6 @@ void __kvm_migrate_timers(struct kvm_vcpu *vcpu);
int apic_has_pending_timer(struct kvm_vcpu *vcpu);
int kvm_setup_default_irq_routing(struct kvm *kvm);
-int kvm_setup_empty_irq_routing(struct kvm *kvm);
int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
struct kvm_lapic_irq *irq,
struct dest_map *dest_map);
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
index 68f3f6c26046..8136695f7b96 100644
--- a/arch/x86/kvm/irq_comm.c
+++ b/arch/x86/kvm/irq_comm.c
@@ -395,13 +395,6 @@ int kvm_setup_default_irq_routing(struct kvm *kvm)
ARRAY_SIZE(default_routing), 0);
}
-static const struct kvm_irq_routing_entry empty_routing[] = {};
-
-int kvm_setup_empty_irq_routing(struct kvm *kvm)
-{
- return kvm_set_irq_routing(kvm, empty_routing, 0, 0);
-}
-
void kvm_arch_post_irq_routing_update(struct kvm *kvm)
{
if (!irqchip_split(kvm))
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index 75eae9c4998a..b1eb46e26b2e 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -98,7 +98,7 @@ static inline unsigned long kvm_register_read_raw(struct kvm_vcpu *vcpu, int reg
return 0;
if (!kvm_register_is_available(vcpu, reg))
- static_call(kvm_x86_cache_reg)(vcpu, reg);
+ kvm_x86_call(cache_reg)(vcpu, reg);
return vcpu->arch.regs[reg];
}
@@ -138,7 +138,7 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)
might_sleep(); /* on svm */
if (!kvm_register_is_available(vcpu, VCPU_EXREG_PDPTR))
- static_call(kvm_x86_cache_reg)(vcpu, VCPU_EXREG_PDPTR);
+ kvm_x86_call(cache_reg)(vcpu, VCPU_EXREG_PDPTR);
return vcpu->arch.walk_mmu->pdptrs[index];
}
@@ -153,7 +153,7 @@ static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask)
ulong tmask = mask & KVM_POSSIBLE_CR0_GUEST_BITS;
if ((tmask & vcpu->arch.cr0_guest_owned_bits) &&
!kvm_register_is_available(vcpu, VCPU_EXREG_CR0))
- static_call(kvm_x86_cache_reg)(vcpu, VCPU_EXREG_CR0);
+ kvm_x86_call(cache_reg)(vcpu, VCPU_EXREG_CR0);
return vcpu->arch.cr0 & mask;
}
@@ -175,7 +175,7 @@ static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask)
ulong tmask = mask & KVM_POSSIBLE_CR4_GUEST_BITS;
if ((tmask & vcpu->arch.cr4_guest_owned_bits) &&
!kvm_register_is_available(vcpu, VCPU_EXREG_CR4))
- static_call(kvm_x86_cache_reg)(vcpu, VCPU_EXREG_CR4);
+ kvm_x86_call(cache_reg)(vcpu, VCPU_EXREG_CR4);
return vcpu->arch.cr4 & mask;
}
@@ -190,7 +190,7 @@ static __always_inline bool kvm_is_cr4_bit_set(struct kvm_vcpu *vcpu,
static inline ulong kvm_read_cr3(struct kvm_vcpu *vcpu)
{
if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3))
- static_call(kvm_x86_cache_reg)(vcpu, VCPU_EXREG_CR3);
+ kvm_x86_call(cache_reg)(vcpu, VCPU_EXREG_CR3);
return vcpu->arch.cr3;
}
diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h
index 5382646162a3..55a18e2f2dcd 100644
--- a/arch/x86/kvm/kvm_emulate.h
+++ b/arch/x86/kvm/kvm_emulate.h
@@ -26,6 +26,7 @@ struct x86_exception {
bool nested_page_fault;
u64 address; /* cr2 or nested page fault gpa */
u8 async_page_fault;
+ unsigned long exit_qualification;
};
/*
@@ -222,6 +223,7 @@ struct x86_emulate_ops {
bool (*guest_has_movbe)(struct x86_emulate_ctxt *ctxt);
bool (*guest_has_fxsr)(struct x86_emulate_ctxt *ctxt);
bool (*guest_has_rdpid)(struct x86_emulate_ctxt *ctxt);
+ bool (*guest_cpuid_is_intel_compatible)(struct x86_emulate_ctxt *ctxt);
void (*set_nmi_mask)(struct x86_emulate_ctxt *ctxt, bool masked);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index ebf41023be38..2098dc689088 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -59,7 +59,17 @@
#define MAX_APIC_VECTOR 256
#define APIC_VECTORS_PER_REG 32
-static bool lapic_timer_advance_dynamic __read_mostly;
+/*
+ * Enable local APIC timer advancement (tscdeadline mode only) with adaptive
+ * tuning. When enabled, KVM programs the host timer event to fire early, i.e.
+ * before the deadline expires, to account for the delay between taking the
+ * VM-Exit (to inject the guest event) and the subsequent VM-Enter to resume
+ * the guest, i.e. so that the interrupt arrives in the guest with minimal
+ * latency relative to the deadline programmed by the guest.
+ */
+static bool lapic_timer_advance __read_mostly = true;
+module_param(lapic_timer_advance, bool, 0444);
+
#define LAPIC_TIMER_ADVANCE_ADJUST_MIN 100 /* clock cycles */
#define LAPIC_TIMER_ADVANCE_ADJUST_MAX 10000 /* clock cycles */
#define LAPIC_TIMER_ADVANCE_NS_INIT 1000
@@ -341,10 +351,8 @@ static void kvm_recalculate_logical_map(struct kvm_apic_map *new,
* reversing the LDR calculation to get cluster of APICs, i.e. no
* additional work is required.
*/
- if (apic_x2apic_mode(apic)) {
- WARN_ON_ONCE(ldr != kvm_apic_calc_x2apic_ldr(kvm_x2apic_id(apic)));
+ if (apic_x2apic_mode(apic))
return;
- }
if (WARN_ON_ONCE(!kvm_apic_map_get_logical_dest(new, ldr,
&cluster, &mask))) {
@@ -728,8 +736,8 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
if (unlikely(apic->apicv_active)) {
/* need to update RVI */
kvm_lapic_clear_vector(vec, apic->regs + APIC_IRR);
- static_call_cond(kvm_x86_hwapic_irr_update)(apic->vcpu,
- apic_find_highest_irr(apic));
+ kvm_x86_call(hwapic_irr_update)(apic->vcpu,
+ apic_find_highest_irr(apic));
} else {
apic->irr_pending = false;
kvm_lapic_clear_vector(vec, apic->regs + APIC_IRR);
@@ -755,7 +763,7 @@ static inline void apic_set_isr(int vec, struct kvm_lapic *apic)
* just set SVI.
*/
if (unlikely(apic->apicv_active))
- static_call_cond(kvm_x86_hwapic_isr_update)(vec);
+ kvm_x86_call(hwapic_isr_update)(vec);
else {
++apic->isr_count;
BUG_ON(apic->isr_count > MAX_APIC_VECTOR);
@@ -800,7 +808,7 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic)
* and must be left alone.
*/
if (unlikely(apic->apicv_active))
- static_call_cond(kvm_x86_hwapic_isr_update)(apic_find_highest_isr(apic));
+ kvm_x86_call(hwapic_isr_update)(apic_find_highest_isr(apic));
else {
--apic->isr_count;
BUG_ON(apic->isr_count < 0);
@@ -936,7 +944,7 @@ static int apic_has_interrupt_for_ppr(struct kvm_lapic *apic, u32 ppr)
{
int highest_irr;
if (kvm_x86_ops.sync_pir_to_irr)
- highest_irr = static_call(kvm_x86_sync_pir_to_irr)(apic->vcpu);
+ highest_irr = kvm_x86_call(sync_pir_to_irr)(apic->vcpu);
else
highest_irr = apic_find_highest_irr(apic);
if (highest_irr == -1 || (highest_irr & 0xF0) <= ppr)
@@ -1328,8 +1336,8 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
apic->regs + APIC_TMR);
}
- static_call(kvm_x86_deliver_interrupt)(apic, delivery_mode,
- trig_mode, vector);
+ kvm_x86_call(deliver_interrupt)(apic, delivery_mode,
+ trig_mode, vector);
break;
case APIC_DM_REMRD:
@@ -1547,7 +1555,8 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic)
remaining = 0;
ns = mod_64(ktime_to_ns(remaining), apic->lapic_timer.period);
- return div64_u64(ns, (APIC_BUS_CYCLE_NS * apic->divide_count));
+ return div64_u64(ns, (apic->vcpu->kvm->arch.apic_bus_cycle_ns *
+ apic->divide_count));
}
static void __report_tpr_access(struct kvm_lapic *apic, bool write)
@@ -1732,7 +1741,7 @@ static void limit_periodic_timer_frequency(struct kvm_lapic *apic)
s64 min_period = min_timer_period_us * 1000LL;
if (apic->lapic_timer.period < min_period) {
- pr_info_ratelimited(
+ pr_info_once(
"vcpu %i: requested %lld ns "
"lapic timer period limited to %lld ns\n",
apic->vcpu->vcpu_id,
@@ -1854,16 +1863,14 @@ static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu)
guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline);
- if (lapic_timer_advance_dynamic) {
- adjust_lapic_timer_advance(vcpu, guest_tsc - tsc_deadline);
- /*
- * If the timer fired early, reread the TSC to account for the
- * overhead of the above adjustment to avoid waiting longer
- * than is necessary.
- */
- if (guest_tsc < tsc_deadline)
- guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
- }
+ adjust_lapic_timer_advance(vcpu, guest_tsc - tsc_deadline);
+
+ /*
+ * If the timer fired early, reread the TSC to account for the overhead
+ * of the above adjustment to avoid waiting longer than is necessary.
+ */
+ if (guest_tsc < tsc_deadline)
+ guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
if (guest_tsc < tsc_deadline)
__wait_lapic_expire(vcpu, tsc_deadline - guest_tsc);
@@ -1937,7 +1944,7 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic)
u64 ns = 0;
ktime_t expire;
struct kvm_vcpu *vcpu = apic->vcpu;
- unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz;
+ u32 this_tsc_khz = vcpu->arch.virtual_tsc_khz;
unsigned long flags;
ktime_t now;
@@ -1965,7 +1972,8 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic)
static inline u64 tmict_to_ns(struct kvm_lapic *apic, u32 tmict)
{
- return (u64)tmict * APIC_BUS_CYCLE_NS * (u64)apic->divide_count;
+ return (u64)tmict * apic->vcpu->kvm->arch.apic_bus_cycle_ns *
+ (u64)apic->divide_count;
}
static void update_target_expiration(struct kvm_lapic *apic, uint32_t old_divisor)
@@ -2095,7 +2103,7 @@ static void cancel_hv_timer(struct kvm_lapic *apic)
{
WARN_ON(preemptible());
WARN_ON(!apic->lapic_timer.hv_timer_in_use);
- static_call(kvm_x86_cancel_hv_timer)(apic->vcpu);
+ kvm_x86_call(cancel_hv_timer)(apic->vcpu);
apic->lapic_timer.hv_timer_in_use = false;
}
@@ -2112,7 +2120,7 @@ static bool start_hv_timer(struct kvm_lapic *apic)
if (!ktimer->tscdeadline)
return false;
- if (static_call(kvm_x86_set_hv_timer)(vcpu, ktimer->tscdeadline, &expired))
+ if (kvm_x86_call(set_hv_timer)(vcpu, ktimer->tscdeadline, &expired))
return false;
ktimer->hv_timer_in_use = true;
@@ -2445,6 +2453,43 @@ void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi);
+#define X2APIC_ICR_RESERVED_BITS (GENMASK_ULL(31, 20) | GENMASK_ULL(17, 16) | BIT(13))
+
+int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data)
+{
+ if (data & X2APIC_ICR_RESERVED_BITS)
+ return 1;
+
+ /*
+ * The BUSY bit is reserved on both Intel and AMD in x2APIC mode, but
+ * only AMD requires it to be zero, Intel essentially just ignores the
+ * bit. And if IPI virtualization (Intel) or x2AVIC (AMD) is enabled,
+ * the CPU performs the reserved bits checks, i.e. the underlying CPU
+ * behavior will "win". Arbitrarily clear the BUSY bit, as there is no
+ * sane way to provide consistent behavior with respect to hardware.
+ */
+ data &= ~APIC_ICR_BUSY;
+
+ kvm_apic_send_ipi(apic, (u32)data, (u32)(data >> 32));
+ if (kvm_x86_ops.x2apic_icr_is_split) {
+ kvm_lapic_set_reg(apic, APIC_ICR, data);
+ kvm_lapic_set_reg(apic, APIC_ICR2, data >> 32);
+ } else {
+ kvm_lapic_set_reg64(apic, APIC_ICR, data);
+ }
+ trace_kvm_apic_write(APIC_ICR, data);
+ return 0;
+}
+
+static u64 kvm_x2apic_icr_read(struct kvm_lapic *apic)
+{
+ if (kvm_x86_ops.x2apic_icr_is_split)
+ return (u64)kvm_lapic_get_reg(apic, APIC_ICR) |
+ (u64)kvm_lapic_get_reg(apic, APIC_ICR2) << 32;
+
+ return kvm_lapic_get_reg64(apic, APIC_ICR);
+}
+
/* emulate APIC access in a trap manner */
void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset)
{
@@ -2462,7 +2507,7 @@ void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset)
* maybe-unecessary write, and both are in the noise anyways.
*/
if (apic_x2apic_mode(apic) && offset == APIC_ICR)
- kvm_x2apic_icr_write(apic, kvm_lapic_get_reg64(apic, APIC_ICR));
+ WARN_ON_ONCE(kvm_x2apic_icr_write(apic, kvm_x2apic_icr_read(apic)));
else
kvm_lapic_reg_write(apic, offset, kvm_lapic_get_reg(apic, offset));
}
@@ -2567,7 +2612,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
if ((old_value ^ value) & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE)) {
kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
- static_call_cond(kvm_x86_set_virtual_apic_mode)(vcpu);
+ kvm_x86_call(set_virtual_apic_mode)(vcpu);
}
apic->base_address = apic->vcpu->arch.apic_base &
@@ -2677,7 +2722,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
u64 msr_val;
int i;
- static_call_cond(kvm_x86_apicv_pre_state_restore)(vcpu);
+ kvm_x86_call(apicv_pre_state_restore)(vcpu);
if (!init_event) {
msr_val = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE;
@@ -2732,9 +2777,9 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
vcpu->arch.pv_eoi.msr_val = 0;
apic_update_ppr(apic);
if (apic->apicv_active) {
- static_call_cond(kvm_x86_apicv_post_state_restore)(vcpu);
- static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, -1);
- static_call_cond(kvm_x86_hwapic_isr_update)(-1);
+ kvm_x86_call(apicv_post_state_restore)(vcpu);
+ kvm_x86_call(hwapic_irr_update)(vcpu, -1);
+ kvm_x86_call(hwapic_isr_update)(-1);
}
vcpu->arch.apic_arb_prio = 0;
@@ -2812,7 +2857,7 @@ static enum hrtimer_restart apic_timer_fn(struct hrtimer *data)
return HRTIMER_NORESTART;
}
-int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns)
+int kvm_create_lapic(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic;
@@ -2830,7 +2875,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns)
vcpu->arch.apic = apic;
if (kvm_x86_ops.alloc_apic_backing_page)
- apic->regs = static_call(kvm_x86_alloc_apic_backing_page)(vcpu);
+ apic->regs = kvm_x86_call(alloc_apic_backing_page)(vcpu);
else
apic->regs = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
if (!apic->regs) {
@@ -2845,13 +2890,8 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns)
hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
HRTIMER_MODE_ABS_HARD);
apic->lapic_timer.timer.function = apic_timer_fn;
- if (timer_advance_ns == -1) {
+ if (lapic_timer_advance)
apic->lapic_timer.timer_advance_ns = LAPIC_TIMER_ADVANCE_NS_INIT;
- lapic_timer_advance_dynamic = true;
- } else {
- apic->lapic_timer.timer_advance_ns = timer_advance_ns;
- lapic_timer_advance_dynamic = false;
- }
/*
* Stuff the APIC ENABLE bit in lieu of temporarily incrementing
@@ -2919,14 +2959,13 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
}
}
-int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
+void kvm_apic_ack_interrupt(struct kvm_vcpu *vcpu, int vector)
{
- int vector = kvm_apic_has_interrupt(vcpu);
struct kvm_lapic *apic = vcpu->arch.apic;
u32 ppr;
- if (vector == -1)
- return -1;
+ if (WARN_ON_ONCE(vector < 0 || !apic))
+ return;
/*
* We get here even with APIC virtualization enabled, if doing
@@ -2954,41 +2993,55 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
__apic_update_ppr(apic, &ppr);
}
- return vector;
}
+EXPORT_SYMBOL_GPL(kvm_apic_ack_interrupt);
static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu,
struct kvm_lapic_state *s, bool set)
{
if (apic_x2apic_mode(vcpu->arch.apic)) {
+ u32 x2apic_id = kvm_x2apic_id(vcpu->arch.apic);
u32 *id = (u32 *)(s->regs + APIC_ID);
u32 *ldr = (u32 *)(s->regs + APIC_LDR);
u64 icr;
if (vcpu->kvm->arch.x2apic_format) {
- if (*id != vcpu->vcpu_id)
+ if (*id != x2apic_id)
return -EINVAL;
} else {
+ /*
+ * Ignore the userspace value when setting APIC state.
+ * KVM's model is that the x2APIC ID is readonly, e.g.
+ * KVM only supports delivering interrupts to KVM's
+ * version of the x2APIC ID. However, for backwards
+ * compatibility, don't reject attempts to set a
+ * mismatched ID for userspace that hasn't opted into
+ * x2apic_format.
+ */
if (set)
- *id >>= 24;
+ *id = x2apic_id;
else
- *id <<= 24;
+ *id = x2apic_id << 24;
}
/*
* In x2APIC mode, the LDR is fixed and based on the id. And
- * ICR is internally a single 64-bit register, but needs to be
- * split to ICR+ICR2 in userspace for backwards compatibility.
+ * if the ICR is _not_ split, ICR is internally a single 64-bit
+ * register, but needs to be split to ICR+ICR2 in userspace for
+ * backwards compatibility.
*/
- if (set) {
- *ldr = kvm_apic_calc_x2apic_ldr(*id);
-
- icr = __kvm_lapic_get_reg(s->regs, APIC_ICR) |
- (u64)__kvm_lapic_get_reg(s->regs, APIC_ICR2) << 32;
- __kvm_lapic_set_reg64(s->regs, APIC_ICR, icr);
- } else {
- icr = __kvm_lapic_get_reg64(s->regs, APIC_ICR);
- __kvm_lapic_set_reg(s->regs, APIC_ICR2, icr >> 32);
+ if (set)
+ *ldr = kvm_apic_calc_x2apic_ldr(x2apic_id);
+
+ if (!kvm_x86_ops.x2apic_icr_is_split) {
+ if (set) {
+ icr = __kvm_lapic_get_reg(s->regs, APIC_ICR) |
+ (u64)__kvm_lapic_get_reg(s->regs, APIC_ICR2) << 32;
+ __kvm_lapic_set_reg64(s->regs, APIC_ICR, icr);
+ } else {
+ icr = __kvm_lapic_get_reg64(s->regs, APIC_ICR);
+ __kvm_lapic_set_reg(s->regs, APIC_ICR2, icr >> 32);
+ }
}
}
@@ -3014,7 +3067,7 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
struct kvm_lapic *apic = vcpu->arch.apic;
int r;
- static_call_cond(kvm_x86_apicv_pre_state_restore)(vcpu);
+ kvm_x86_call(apicv_pre_state_restore)(vcpu);
kvm_lapic_set_base(vcpu, vcpu->arch.apic_base);
/* set SPIV separately to get count of SW disabled APICs right */
@@ -3041,9 +3094,10 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
kvm_lapic_set_reg(apic, APIC_TMCCT, 0);
kvm_apic_update_apicv(vcpu);
if (apic->apicv_active) {
- static_call_cond(kvm_x86_apicv_post_state_restore)(vcpu);
- static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, apic_find_highest_irr(apic));
- static_call_cond(kvm_x86_hwapic_isr_update)(apic_find_highest_isr(apic));
+ kvm_x86_call(apicv_post_state_restore)(vcpu);
+ kvm_x86_call(hwapic_irr_update)(vcpu,
+ apic_find_highest_irr(apic));
+ kvm_x86_call(hwapic_isr_update)(apic_find_highest_isr(apic));
}
kvm_make_request(KVM_REQ_EVENT, vcpu);
if (ioapic_in_kernel(vcpu->kvm))
@@ -3180,22 +3234,12 @@ int kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr)
return 0;
}
-int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data)
-{
- data &= ~APIC_ICR_BUSY;
-
- kvm_apic_send_ipi(apic, (u32)data, (u32)(data >> 32));
- kvm_lapic_set_reg64(apic, APIC_ICR, data);
- trace_kvm_apic_write(APIC_ICR, data);
- return 0;
-}
-
static int kvm_lapic_msr_read(struct kvm_lapic *apic, u32 reg, u64 *data)
{
u32 low;
if (reg == APIC_ICR) {
- *data = kvm_lapic_get_reg64(apic, APIC_ICR);
+ *data = kvm_x2apic_icr_read(apic);
return 0;
}
@@ -3331,7 +3375,8 @@ int kvm_apic_accept_events(struct kvm_vcpu *vcpu)
/* evaluate pending_events before reading the vector */
smp_rmb();
sipi_vector = apic->sipi_vector;
- static_call(kvm_x86_vcpu_deliver_sipi_vector)(vcpu, sipi_vector);
+ kvm_x86_call(vcpu_deliver_sipi_vector)(vcpu,
+ sipi_vector);
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
}
}
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 0a0ea4b5dd8c..1b8ef9856422 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -16,8 +16,7 @@
#define APIC_DEST_NOSHORT 0x0
#define APIC_DEST_MASK 0x800
-#define APIC_BUS_CYCLE_NS 1
-#define APIC_BUS_FREQUENCY (1000000000ULL / APIC_BUS_CYCLE_NS)
+#define APIC_BUS_CYCLE_NS_DEFAULT 1
#define APIC_BROADCAST 0xFF
#define X2APIC_BROADCAST 0xFFFFFFFFul
@@ -85,19 +84,18 @@ struct kvm_lapic {
struct dest_map;
-int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns);
+int kvm_create_lapic(struct kvm_vcpu *vcpu);
void kvm_free_lapic(struct kvm_vcpu *vcpu);
int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu);
+void kvm_apic_ack_interrupt(struct kvm_vcpu *vcpu, int vector);
int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu);
-int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
int kvm_apic_accept_events(struct kvm_vcpu *vcpu);
void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event);
u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu);
void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
-u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu);
void kvm_recalculate_apic_map(struct kvm *kvm);
void kvm_apic_set_version(struct kvm_vcpu *vcpu);
void kvm_apic_after_set_mcg_cap(struct kvm_vcpu *vcpu);
@@ -236,7 +234,7 @@ static inline bool kvm_apic_has_pending_init_or_sipi(struct kvm_vcpu *vcpu)
static inline bool kvm_apic_init_sipi_allowed(struct kvm_vcpu *vcpu)
{
return !is_smm(vcpu) &&
- !static_call(kvm_x86_apic_init_signal_blocked)(vcpu);
+ !kvm_x86_call(apic_init_signal_blocked)(vcpu);
}
static inline bool kvm_lowest_prio_delivery(struct kvm_lapic_irq *irq)
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 60f21bb4c27b..9dc5dd43ae7f 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -57,12 +57,6 @@ static __always_inline u64 rsvd_bits(int s, int e)
return ((2ULL << (e - s)) - 1) << s;
}
-/*
- * The number of non-reserved physical address bits irrespective of features
- * that repurpose legal bits, e.g. MKTME.
- */
-extern u8 __read_mostly shadow_phys_bits;
-
static inline gfn_t kvm_mmu_max_gfn(void)
{
/*
@@ -76,29 +70,12 @@ static inline gfn_t kvm_mmu_max_gfn(void)
* than hardware's real MAXPHYADDR. Using the host MAXPHYADDR
* disallows such SPTEs entirely and simplifies the TDP MMU.
*/
- int max_gpa_bits = likely(tdp_enabled) ? shadow_phys_bits : 52;
+ int max_gpa_bits = likely(tdp_enabled) ? kvm_host.maxphyaddr : 52;
return (1ULL << (max_gpa_bits - PAGE_SHIFT)) - 1;
}
-static inline u8 kvm_get_shadow_phys_bits(void)
-{
- /*
- * boot_cpu_data.x86_phys_bits is reduced when MKTME or SME are detected
- * in CPU detection code, but the processor treats those reduced bits as
- * 'keyID' thus they are not reserved bits. Therefore KVM needs to look at
- * the physical address bits reported by CPUID.
- */
- if (likely(boot_cpu_data.extended_cpuid_level >= 0x80000008))
- return cpuid_eax(0x80000008) & 0xff;
-
- /*
- * Quite weird to have VMX or SVM but not MAXPHYADDR; probably a VM with
- * custom CPUID. Proceed with whatever the kernel found since these features
- * aren't virtualizable (SME/SEV also require CPUIDs higher than 0x80000008).
- */
- return boot_cpu_data.x86_phys_bits;
-}
+u8 kvm_mmu_get_max_tdp_level(void);
void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask);
void kvm_mmu_set_me_spte_mask(u64 me_value, u64 me_mask);
@@ -161,8 +138,8 @@ static inline void kvm_mmu_load_pgd(struct kvm_vcpu *vcpu)
if (!VALID_PAGE(root_hpa))
return;
- static_call(kvm_x86_load_mmu_pgd)(vcpu, root_hpa,
- vcpu->arch.mmu->root_role.level);
+ kvm_x86_call(load_mmu_pgd)(vcpu, root_hpa,
+ vcpu->arch.mmu->root_role.level);
}
static inline void kvm_mmu_refresh_passthrough_bits(struct kvm_vcpu *vcpu,
@@ -197,7 +174,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
{
/* strip nested paging fault error codes */
unsigned int pfec = access;
- unsigned long rflags = static_call(kvm_x86_get_rflags)(vcpu);
+ unsigned long rflags = kvm_x86_call(get_rflags)(vcpu);
/*
* For explicit supervisor accesses, SMAP is disabled if EFLAGS.AC = 1.
@@ -213,7 +190,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
*/
u64 implicit_access = access & PFERR_IMPLICIT_ACCESS;
bool not_smap = ((rflags & X86_EFLAGS_AC) | implicit_access) == X86_EFLAGS_AC;
- int index = (pfec + (not_smap << PFERR_RSVD_BIT)) >> 1;
+ int index = (pfec | (not_smap ? PFERR_RSVD_MASK : 0)) >> 1;
u32 errcode = PFERR_PRESENT_MASK;
bool fault;
@@ -234,8 +211,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
pkru_bits = (vcpu->arch.pkru >> (pte_pkey * 2)) & 3;
/* clear present bit, replace PFEC.RSVD with ACC_USER_MASK. */
- offset = (pfec & ~1) +
- ((pte_access & PT_USER_MASK) << (PFERR_RSVD_BIT - PT_USER_SHIFT));
+ offset = (pfec & ~1) | ((pte_access & PT_USER_MASK) ? PFERR_RSVD_MASK : 0);
pkru_bits &= mmu->pkru_mask >> offset;
errcode |= -pkru_bits & PFERR_PK_MASK;
@@ -245,16 +221,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
return -(u32)fault & errcode;
}
-bool __kvm_mmu_honors_guest_mtrrs(bool vm_has_noncoherent_dma);
-
-static inline bool kvm_mmu_honors_guest_mtrrs(struct kvm *kvm)
-{
- return __kvm_mmu_honors_guest_mtrrs(kvm_arch_has_noncoherent_dma(kvm));
-}
-
-void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end);
-
-int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu);
+bool kvm_mmu_may_ignore_guest_pat(void);
int kvm_mmu_post_init_vm(struct kvm *kvm);
void kvm_mmu_pre_destroy_vm(struct kvm *kvm);
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index db007a4dffa2..a9a23e058555 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -336,16 +336,19 @@ static int is_cpuid_PSE36(void)
#ifdef CONFIG_X86_64
static void __set_spte(u64 *sptep, u64 spte)
{
+ KVM_MMU_WARN_ON(is_ept_ve_possible(spte));
WRITE_ONCE(*sptep, spte);
}
static void __update_clear_spte_fast(u64 *sptep, u64 spte)
{
+ KVM_MMU_WARN_ON(is_ept_ve_possible(spte));
WRITE_ONCE(*sptep, spte);
}
static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
{
+ KVM_MMU_WARN_ON(is_ept_ve_possible(spte));
return xchg(sptep, spte);
}
@@ -432,8 +435,8 @@ static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
* The idea using the light way get the spte on x86_32 guest is from
* gup_get_pte (mm/gup.c).
*
- * An spte tlb flush may be pending, because kvm_set_pte_rmap
- * coalesces them and we are running out of the MMU lock. Therefore
+ * An spte tlb flush may be pending, because they are coalesced and
+ * we are running out of the MMU lock. Therefore
* we need to protect against in-progress updates of the spte.
*
* Reading the spte while an update is in progress may get the old value
@@ -567,9 +570,9 @@ static u64 mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep)
if (!is_shadow_present_pte(old_spte) ||
!spte_has_volatile_bits(old_spte))
- __update_clear_spte_fast(sptep, 0ull);
+ __update_clear_spte_fast(sptep, SHADOW_NONPRESENT_VALUE);
else
- old_spte = __update_clear_spte_slow(sptep, 0ull);
+ old_spte = __update_clear_spte_slow(sptep, SHADOW_NONPRESENT_VALUE);
if (!is_shadow_present_pte(old_spte))
return old_spte;
@@ -603,7 +606,7 @@ static u64 mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep)
*/
static void mmu_spte_clear_no_track(u64 *sptep)
{
- __update_clear_spte_fast(sptep, 0ull);
+ __update_clear_spte_fast(sptep, SHADOW_NONPRESENT_VALUE);
}
static u64 mmu_spte_get_lockless(u64 *sptep)
@@ -611,32 +614,6 @@ static u64 mmu_spte_get_lockless(u64 *sptep)
return __get_spte_lockless(sptep);
}
-/* Returns the Accessed status of the PTE and resets it at the same time. */
-static bool mmu_spte_age(u64 *sptep)
-{
- u64 spte = mmu_spte_get_lockless(sptep);
-
- if (!is_accessed_spte(spte))
- return false;
-
- if (spte_ad_enabled(spte)) {
- clear_bit((ffs(shadow_accessed_mask) - 1),
- (unsigned long *)sptep);
- } else {
- /*
- * Capture the dirty status of the page, so that it doesn't get
- * lost when the SPTE is marked for access tracking.
- */
- if (is_writable_pte(spte))
- kvm_set_pfn_dirty(spte_to_pfn(spte));
-
- spte = mark_spte_for_access_track(spte);
- mmu_spte_update_no_track(sptep, spte);
- }
-
- return true;
-}
-
static inline bool is_tdp_mmu_active(struct kvm_vcpu *vcpu)
{
return tdp_mmu_enabled && vcpu->arch.mmu->root_role.direct;
@@ -719,7 +696,7 @@ static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
if (sp->role.passthrough)
return sp->gfn;
- if (!sp->role.direct)
+ if (sp->shadowed_translation)
return sp->shadowed_translation[index] >> PAGE_SHIFT;
return sp->gfn + (index << ((sp->role.level - 1) * SPTE_LEVEL_BITS));
@@ -733,7 +710,7 @@ static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
*/
static u32 kvm_mmu_page_get_access(struct kvm_mmu_page *sp, int index)
{
- if (sp_has_gptes(sp))
+ if (sp->shadowed_translation)
return sp->shadowed_translation[index] & ACC_ALL;
/*
@@ -754,7 +731,7 @@ static u32 kvm_mmu_page_get_access(struct kvm_mmu_page *sp, int index)
static void kvm_mmu_page_set_translation(struct kvm_mmu_page *sp, int index,
gfn_t gfn, unsigned int access)
{
- if (sp_has_gptes(sp)) {
+ if (sp->shadowed_translation) {
sp->shadowed_translation[index] = (gfn << PAGE_SHIFT) | access;
return;
}
@@ -831,6 +808,15 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
gfn_t gfn;
kvm->arch.indirect_shadow_pages++;
+ /*
+ * Ensure indirect_shadow_pages is elevated prior to re-reading guest
+ * child PTEs in FNAME(gpte_changed), i.e. guarantee either in-flight
+ * emulated writes are visible before re-reading guest PTEs, or that
+ * an emulated write will see the elevated count and acquire mmu_lock
+ * to update SPTEs. Pairs with the smp_mb() in kvm_mmu_track_write().
+ */
+ smp_mb();
+
gfn = sp->gfn;
slots = kvm_memslots_for_spte_role(kvm, sp->role);
slot = __gfn_to_memslot(slots, gfn);
@@ -926,6 +912,7 @@ static struct kvm_memory_slot *gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu
* in this rmap chain. Otherwise, (rmap_head->val & ~1) points to a struct
* pte_list_desc containing more mappings.
*/
+#define KVM_RMAP_MANY BIT(0)
/*
* Returns the number of pointers in the rmap chain, not counting the new one.
@@ -938,16 +925,16 @@ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte,
if (!rmap_head->val) {
rmap_head->val = (unsigned long)spte;
- } else if (!(rmap_head->val & 1)) {
+ } else if (!(rmap_head->val & KVM_RMAP_MANY)) {
desc = kvm_mmu_memory_cache_alloc(cache);
desc->sptes[0] = (u64 *)rmap_head->val;
desc->sptes[1] = spte;
desc->spte_count = 2;
desc->tail_count = 0;
- rmap_head->val = (unsigned long)desc | 1;
+ rmap_head->val = (unsigned long)desc | KVM_RMAP_MANY;
++count;
} else {
- desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
+ desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
count = desc->tail_count + desc->spte_count;
/*
@@ -956,10 +943,10 @@ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte,
*/
if (desc->spte_count == PTE_LIST_EXT) {
desc = kvm_mmu_memory_cache_alloc(cache);
- desc->more = (struct pte_list_desc *)(rmap_head->val & ~1ul);
+ desc->more = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
desc->spte_count = 0;
desc->tail_count = count;
- rmap_head->val = (unsigned long)desc | 1;
+ rmap_head->val = (unsigned long)desc | KVM_RMAP_MANY;
}
desc->sptes[desc->spte_count++] = spte;
}
@@ -970,7 +957,7 @@ static void pte_list_desc_remove_entry(struct kvm *kvm,
struct kvm_rmap_head *rmap_head,
struct pte_list_desc *desc, int i)
{
- struct pte_list_desc *head_desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
+ struct pte_list_desc *head_desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
int j = head_desc->spte_count - 1;
/*
@@ -999,7 +986,7 @@ static void pte_list_desc_remove_entry(struct kvm *kvm,
if (!head_desc->more)
rmap_head->val = 0;
else
- rmap_head->val = (unsigned long)head_desc->more | 1;
+ rmap_head->val = (unsigned long)head_desc->more | KVM_RMAP_MANY;
mmu_free_pte_list_desc(head_desc);
}
@@ -1012,13 +999,13 @@ static void pte_list_remove(struct kvm *kvm, u64 *spte,
if (KVM_BUG_ON_DATA_CORRUPTION(!rmap_head->val, kvm))
return;
- if (!(rmap_head->val & 1)) {
+ if (!(rmap_head->val & KVM_RMAP_MANY)) {
if (KVM_BUG_ON_DATA_CORRUPTION((u64 *)rmap_head->val != spte, kvm))
return;
rmap_head->val = 0;
} else {
- desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
+ desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
while (desc) {
for (i = 0; i < desc->spte_count; ++i) {
if (desc->sptes[i] == spte) {
@@ -1051,12 +1038,12 @@ static bool kvm_zap_all_rmap_sptes(struct kvm *kvm,
if (!rmap_head->val)
return false;
- if (!(rmap_head->val & 1)) {
+ if (!(rmap_head->val & KVM_RMAP_MANY)) {
mmu_spte_clear_track_bits(kvm, (u64 *)rmap_head->val);
goto out;
}
- desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
+ desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
for (; desc; desc = next) {
for (i = 0; i < desc->spte_count; i++)
@@ -1076,10 +1063,10 @@ unsigned int pte_list_count(struct kvm_rmap_head *rmap_head)
if (!rmap_head->val)
return 0;
- else if (!(rmap_head->val & 1))
+ else if (!(rmap_head->val & KVM_RMAP_MANY))
return 1;
- desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
+ desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
return desc->tail_count + desc->spte_count;
}
@@ -1141,13 +1128,13 @@ static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head,
if (!rmap_head->val)
return NULL;
- if (!(rmap_head->val & 1)) {
+ if (!(rmap_head->val & KVM_RMAP_MANY)) {
iter->desc = NULL;
sptep = (u64 *)rmap_head->val;
goto out;
}
- iter->desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
+ iter->desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
iter->pos = 0;
sptep = iter->desc->sptes[iter->pos];
out:
@@ -1295,15 +1282,6 @@ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
return flush;
}
-/**
- * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages
- * @kvm: kvm instance
- * @slot: slot to protect
- * @gfn_offset: start of the BITS_PER_LONG pages we care about
- * @mask: indicates which pages we should protect
- *
- * Used when we do not need to care about huge page mappings.
- */
static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
struct kvm_memory_slot *slot,
gfn_t gfn_offset, unsigned long mask)
@@ -1327,16 +1305,6 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
}
}
-/**
- * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages, or write
- * protect the page if the D-bit isn't supported.
- * @kvm: kvm instance
- * @slot: slot to clear D-bit
- * @gfn_offset: start of the BITS_PER_LONG pages we care about
- * @mask: indicates which pages we should clear D-bit
- *
- * Used for PML to re-log the dirty GPAs after userspace querying dirty_bitmap.
- */
static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
struct kvm_memory_slot *slot,
gfn_t gfn_offset, unsigned long mask)
@@ -1360,24 +1328,16 @@ static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
}
}
-/**
- * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
- * PT level pages.
- *
- * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
- * enable dirty logging for them.
- *
- * We need to care about huge page mappings: e.g. during dirty logging we may
- * have such mappings.
- */
void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
struct kvm_memory_slot *slot,
gfn_t gfn_offset, unsigned long mask)
{
/*
- * Huge pages are NOT write protected when we start dirty logging in
- * initially-all-set mode; must write protect them here so that they
- * are split to 4K on the first write.
+ * If the slot was assumed to be "initially all dirty", write-protect
+ * huge pages to ensure they are split to 4KiB on the first write (KVM
+ * dirty logs at 4KiB granularity). If eager page splitting is enabled,
+ * immediately try to split huge pages, e.g. so that vCPUs don't get
+ * saddled with the cost of splitting.
*
* The gfn_offset is guaranteed to be aligned to 64, but the base_gfn
* of memslot has no such restriction, so the range can cross two large
@@ -1399,7 +1359,16 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
PG_LEVEL_2M);
}
- /* Now handle 4K PTEs. */
+ /*
+ * (Re)Enable dirty logging for all 4KiB SPTEs that map the GFNs in
+ * mask. If PML is enabled and the GFN doesn't need to be write-
+ * protected for other reasons, e.g. shadow paging, clear the Dirty bit.
+ * Otherwise clear the Writable bit.
+ *
+ * Note that kvm_mmu_clear_dirty_pt_masked() is called whenever PML is
+ * enabled but it chooses between clearing the Dirty bit and Writeable
+ * bit based on the context.
+ */
if (kvm_x86_ops.cpu_dirty_log_size)
kvm_mmu_clear_dirty_pt_masked(kvm, slot, gfn_offset, mask);
else
@@ -1441,54 +1410,10 @@ static bool kvm_vcpu_write_protect_gfn(struct kvm_vcpu *vcpu, u64 gfn)
return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn, PG_LEVEL_4K);
}
-static bool __kvm_zap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
- const struct kvm_memory_slot *slot)
-{
- return kvm_zap_all_rmap_sptes(kvm, rmap_head);
-}
-
static bool kvm_zap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
- struct kvm_memory_slot *slot, gfn_t gfn, int level,
- pte_t unused)
-{
- return __kvm_zap_rmap(kvm, rmap_head, slot);
-}
-
-static bool kvm_set_pte_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
- struct kvm_memory_slot *slot, gfn_t gfn, int level,
- pte_t pte)
+ const struct kvm_memory_slot *slot)
{
- u64 *sptep;
- struct rmap_iterator iter;
- bool need_flush = false;
- u64 new_spte;
- kvm_pfn_t new_pfn;
-
- WARN_ON_ONCE(pte_huge(pte));
- new_pfn = pte_pfn(pte);
-
-restart:
- for_each_rmap_spte(rmap_head, &iter, sptep) {
- need_flush = true;
-
- if (pte_write(pte)) {
- kvm_zap_one_rmap_spte(kvm, rmap_head, sptep);
- goto restart;
- } else {
- new_spte = kvm_mmu_changed_pte_notifier_make_spte(
- *sptep, new_pfn);
-
- mmu_spte_clear_track_bits(kvm, sptep);
- mmu_spte_set(sptep, new_spte);
- }
- }
-
- if (need_flush && kvm_available_flush_remote_tlbs_range()) {
- kvm_flush_remote_tlbs_gfn(kvm, gfn, level);
- return false;
- }
-
- return need_flush;
+ return kvm_zap_all_rmap_sptes(kvm, rmap_head);
}
struct slot_rmap_walk_iterator {
@@ -1539,7 +1464,7 @@ static bool slot_rmap_walk_okay(struct slot_rmap_walk_iterator *iterator)
static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
{
while (++iterator->rmap <= iterator->end_rmap) {
- iterator->gfn += (1UL << KVM_HPAGE_GFN_SHIFT(iterator->level));
+ iterator->gfn += KVM_PAGES_PER_HPAGE(iterator->level);
if (iterator->rmap->val)
return;
@@ -1560,80 +1485,90 @@ static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
slot_rmap_walk_okay(_iter_); \
slot_rmap_walk_next(_iter_))
-typedef bool (*rmap_handler_t)(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
- struct kvm_memory_slot *slot, gfn_t gfn,
- int level, pte_t pte);
+/* The return value indicates if tlb flush on all vcpus is needed. */
+typedef bool (*slot_rmaps_handler) (struct kvm *kvm,
+ struct kvm_rmap_head *rmap_head,
+ const struct kvm_memory_slot *slot);
-static __always_inline bool kvm_handle_gfn_range(struct kvm *kvm,
- struct kvm_gfn_range *range,
- rmap_handler_t handler)
+static __always_inline bool __walk_slot_rmaps(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ slot_rmaps_handler fn,
+ int start_level, int end_level,
+ gfn_t start_gfn, gfn_t end_gfn,
+ bool can_yield, bool flush_on_yield,
+ bool flush)
{
struct slot_rmap_walk_iterator iterator;
- bool ret = false;
-
- for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
- range->start, range->end - 1, &iterator)
- ret |= handler(kvm, iterator.rmap, range->slot, iterator.gfn,
- iterator.level, range->arg.pte);
- return ret;
-}
-
-bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
-{
- bool flush = false;
+ lockdep_assert_held_write(&kvm->mmu_lock);
- if (kvm_memslots_have_rmaps(kvm))
- flush = kvm_handle_gfn_range(kvm, range, kvm_zap_rmap);
+ for_each_slot_rmap_range(slot, start_level, end_level, start_gfn,
+ end_gfn, &iterator) {
+ if (iterator.rmap)
+ flush |= fn(kvm, iterator.rmap, slot);
- if (tdp_mmu_enabled)
- flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush);
+ if (!can_yield)
+ continue;
- if (kvm_x86_ops.set_apic_access_page_addr &&
- range->slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT)
- kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
+ if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
+ if (flush && flush_on_yield) {
+ kvm_flush_remote_tlbs_range(kvm, start_gfn,
+ iterator.gfn - start_gfn + 1);
+ flush = false;
+ }
+ cond_resched_rwlock_write(&kvm->mmu_lock);
+ }
+ }
return flush;
}
-bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
+static __always_inline bool walk_slot_rmaps(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ slot_rmaps_handler fn,
+ int start_level, int end_level,
+ bool flush_on_yield)
{
- bool flush = false;
-
- if (kvm_memslots_have_rmaps(kvm))
- flush = kvm_handle_gfn_range(kvm, range, kvm_set_pte_rmap);
+ return __walk_slot_rmaps(kvm, slot, fn, start_level, end_level,
+ slot->base_gfn, slot->base_gfn + slot->npages - 1,
+ true, flush_on_yield, false);
+}
- if (tdp_mmu_enabled)
- flush |= kvm_tdp_mmu_set_spte_gfn(kvm, range);
+static __always_inline bool walk_slot_rmaps_4k(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ slot_rmaps_handler fn,
+ bool flush_on_yield)
+{
+ return walk_slot_rmaps(kvm, slot, fn, PG_LEVEL_4K, PG_LEVEL_4K, flush_on_yield);
+}
- return flush;
+static bool __kvm_rmap_zap_gfn_range(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ gfn_t start, gfn_t end, bool can_yield,
+ bool flush)
+{
+ return __walk_slot_rmaps(kvm, slot, kvm_zap_rmap,
+ PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
+ start, end - 1, can_yield, true, flush);
}
-static bool kvm_age_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
- struct kvm_memory_slot *slot, gfn_t gfn, int level,
- pte_t unused)
+bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
- u64 *sptep;
- struct rmap_iterator iter;
- int young = 0;
+ bool flush = false;
- for_each_rmap_spte(rmap_head, &iter, sptep)
- young |= mmu_spte_age(sptep);
+ if (kvm_memslots_have_rmaps(kvm))
+ flush = __kvm_rmap_zap_gfn_range(kvm, range->slot,
+ range->start, range->end,
+ range->may_block, flush);
- return young;
-}
+ if (tdp_mmu_enabled)
+ flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush);
-static bool kvm_test_age_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
- struct kvm_memory_slot *slot, gfn_t gfn,
- int level, pte_t unused)
-{
- u64 *sptep;
- struct rmap_iterator iter;
+ if (kvm_x86_ops.set_apic_access_page_addr &&
+ range->slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT)
+ kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
- for_each_rmap_spte(rmap_head, &iter, sptep)
- if (is_accessed_spte(*sptep))
- return true;
- return false;
+ return flush;
}
#define RMAP_RECYCLE_THRESHOLD 1000
@@ -1670,12 +1605,52 @@ static void rmap_add(struct kvm_vcpu *vcpu, const struct kvm_memory_slot *slot,
__rmap_add(vcpu->kvm, cache, slot, spte, gfn, access);
}
+static bool kvm_rmap_age_gfn_range(struct kvm *kvm,
+ struct kvm_gfn_range *range, bool test_only)
+{
+ struct slot_rmap_walk_iterator iterator;
+ struct rmap_iterator iter;
+ bool young = false;
+ u64 *sptep;
+
+ for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
+ range->start, range->end - 1, &iterator) {
+ for_each_rmap_spte(iterator.rmap, &iter, sptep) {
+ u64 spte = *sptep;
+
+ if (!is_accessed_spte(spte))
+ continue;
+
+ if (test_only)
+ return true;
+
+ if (spte_ad_enabled(spte)) {
+ clear_bit((ffs(shadow_accessed_mask) - 1),
+ (unsigned long *)sptep);
+ } else {
+ /*
+ * Capture the dirty status of the page, so that
+ * it doesn't get lost when the SPTE is marked
+ * for access tracking.
+ */
+ if (is_writable_pte(spte))
+ kvm_set_pfn_dirty(spte_to_pfn(spte));
+
+ spte = mark_spte_for_access_track(spte);
+ mmu_spte_update_no_track(sptep, spte);
+ }
+ young = true;
+ }
+ }
+ return young;
+}
+
bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
bool young = false;
if (kvm_memslots_have_rmaps(kvm))
- young = kvm_handle_gfn_range(kvm, range, kvm_age_rmap);
+ young = kvm_rmap_age_gfn_range(kvm, range, false);
if (tdp_mmu_enabled)
young |= kvm_tdp_mmu_age_gfn_range(kvm, range);
@@ -1688,7 +1663,7 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
bool young = false;
if (kvm_memslots_have_rmaps(kvm))
- young = kvm_handle_gfn_range(kvm, range, kvm_test_age_rmap);
+ young = kvm_rmap_age_gfn_range(kvm, range, true);
if (tdp_mmu_enabled)
young |= kvm_tdp_mmu_test_age_gfn(kvm, range);
@@ -1741,8 +1716,7 @@ static void kvm_mmu_free_shadow_page(struct kvm_mmu_page *sp)
hlist_del(&sp->hash_link);
list_del(&sp->link);
free_page((unsigned long)sp->spt);
- if (!sp->role.direct)
- free_page((unsigned long)sp->shadowed_translation);
+ free_page((unsigned long)sp->shadowed_translation);
kmem_cache_free(mmu_page_header_cache, sp);
}
@@ -1910,10 +1884,14 @@ static bool sp_has_gptes(struct kvm_mmu_page *sp)
if (is_obsolete_sp((_kvm), (_sp))) { \
} else
-#define for_each_gfn_valid_sp_with_gptes(_kvm, _sp, _gfn) \
+#define for_each_gfn_valid_sp(_kvm, _sp, _gfn) \
for_each_valid_sp(_kvm, _sp, \
&(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)]) \
- if ((_sp)->gfn != (_gfn) || !sp_has_gptes(_sp)) {} else
+ if ((_sp)->gfn != (_gfn)) {} else
+
+#define for_each_gfn_valid_sp_with_gptes(_kvm, _sp, _gfn) \
+ for_each_gfn_valid_sp(_kvm, _sp, _gfn) \
+ if (!sp_has_gptes(_sp)) {} else
static bool kvm_sync_page_check(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
@@ -1950,7 +1928,8 @@ static bool kvm_sync_page_check(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
static int kvm_sync_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int i)
{
- if (!sp->spt[i])
+ /* sp->spt[i] has initial value of shadow page table allocation */
+ if (sp->spt[i] == SHADOW_NONPRESENT_VALUE)
return 0;
return vcpu->arch.mmu->sync_spte(vcpu, sp, i);
@@ -2243,7 +2222,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm *kvm,
sp = kvm_mmu_memory_cache_alloc(caches->page_header_cache);
sp->spt = kvm_mmu_memory_cache_alloc(caches->shadow_page_cache);
- if (!role.direct)
+ if (!role.direct && role.level <= KVM_MAX_HUGEPAGE_LEVEL)
sp->shadowed_translation = kvm_mmu_memory_cache_alloc(caches->shadowed_info_cache);
set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
@@ -2514,7 +2493,7 @@ static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
return kvm_mmu_prepare_zap_page(kvm, child,
invalid_list);
}
- } else if (is_mmio_spte(pte)) {
+ } else if (is_mmio_spte(kvm, pte)) {
mmu_spte_clear_no_track(spte);
}
return 0;
@@ -2754,36 +2733,49 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages)
write_unlock(&kvm->mmu_lock);
}
-int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
+bool __kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
+ bool always_retry)
{
- struct kvm_mmu_page *sp;
+ struct kvm *kvm = vcpu->kvm;
LIST_HEAD(invalid_list);
- int r;
+ struct kvm_mmu_page *sp;
+ gpa_t gpa = cr2_or_gpa;
+ bool r = false;
+
+ /*
+ * Bail early if there aren't any write-protected shadow pages to avoid
+ * unnecessarily taking mmu_lock lock, e.g. if the gfn is write-tracked
+ * by a third party. Reading indirect_shadow_pages without holding
+ * mmu_lock is safe, as this is purely an optimization, i.e. a false
+ * positive is benign, and a false negative will simply result in KVM
+ * skipping the unprotect+retry path, which is also an optimization.
+ */
+ if (!READ_ONCE(kvm->arch.indirect_shadow_pages))
+ goto out;
+
+ if (!vcpu->arch.mmu->root_role.direct) {
+ gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL);
+ if (gpa == INVALID_GPA)
+ goto out;
+ }
- r = 0;
write_lock(&kvm->mmu_lock);
- for_each_gfn_valid_sp_with_gptes(kvm, sp, gfn) {
- r = 1;
+ for_each_gfn_valid_sp_with_gptes(kvm, sp, gpa_to_gfn(gpa))
kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
- }
+
+ /*
+ * Snapshot the result before zapping, as zapping will remove all list
+ * entries, i.e. checking the list later would yield a false negative.
+ */
+ r = !list_empty(&invalid_list);
kvm_mmu_commit_zap_page(kvm, &invalid_list);
write_unlock(&kvm->mmu_lock);
- return r;
-}
-
-static int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
-{
- gpa_t gpa;
- int r;
-
- if (vcpu->arch.mmu->root_role.direct)
- return 0;
-
- gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
-
- r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
-
+out:
+ if (r || always_retry) {
+ vcpu->arch.last_retry_eip = kvm_rip_read(vcpu);
+ vcpu->arch.last_retry_addr = cr2_or_gpa;
+ }
return r;
}
@@ -2955,10 +2947,8 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
trace_kvm_mmu_set_spte(level, gfn, sptep);
}
- if (wrprot) {
- if (write_fault)
- ret = RET_PF_EMULATE;
- }
+ if (wrprot && write_fault)
+ ret = RET_PF_WRITE_PROTECTED;
if (flush)
kvm_flush_remote_tlbs_gfn(vcpu->kvm, gfn, level);
@@ -3314,9 +3304,19 @@ static int kvm_handle_noslot_fault(struct kvm_vcpu *vcpu,
{
gva_t gva = fault->is_tdp ? 0 : fault->addr;
+ if (fault->is_private) {
+ kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
+ return -EFAULT;
+ }
+
vcpu_cache_mmio_info(vcpu, gva, fault->gfn,
access & shadow_mmio_access_mask);
+ fault->slot = NULL;
+ fault->pfn = KVM_PFN_NOSLOT;
+ fault->map_writable = false;
+ fault->hva = KVM_HVA_ERR_BAD;
+
/*
* If MMIO caching is disabled, emulate immediately without
* touching the shadow page tables as attempting to install an
@@ -3338,7 +3338,7 @@ static int kvm_handle_noslot_fault(struct kvm_vcpu *vcpu,
return RET_PF_CONTINUE;
}
-static bool page_fault_can_be_fast(struct kvm_page_fault *fault)
+static bool page_fault_can_be_fast(struct kvm *kvm, struct kvm_page_fault *fault)
{
/*
* Page faults with reserved bits set, i.e. faults on MMIO SPTEs, only
@@ -3350,6 +3350,26 @@ static bool page_fault_can_be_fast(struct kvm_page_fault *fault)
return false;
/*
+ * For hardware-protected VMs, certain conditions like attempting to
+ * perform a write to a page which is not in the state that the guest
+ * expects it to be in can result in a nested/extended #PF. In this
+ * case, the below code might misconstrue this situation as being the
+ * result of a write-protected access, and treat it as a spurious case
+ * rather than taking any action to satisfy the real source of the #PF
+ * such as generating a KVM_EXIT_MEMORY_FAULT. This can lead to the
+ * guest spinning on a #PF indefinitely, so don't attempt the fast path
+ * in this case.
+ *
+ * Note that the kvm_mem_is_private() check might race with an
+ * attribute update, but this will either result in the guest spinning
+ * on RET_PF_SPURIOUS until the update completes, or an actual spurious
+ * case might go down the slow path. Either case will resolve itself.
+ */
+ if (kvm->arch.has_private_mem &&
+ fault->is_private != kvm_mem_is_private(kvm, fault->gfn))
+ return false;
+
+ /*
* #PF can be fast if:
*
* 1. The shadow page table entry is not present and A/D bits are
@@ -3449,7 +3469,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
u64 *sptep;
uint retry_count = 0;
- if (!page_fault_can_be_fast(fault))
+ if (!page_fault_can_be_fast(vcpu->kvm, fault))
return ret;
walk_shadow_page_lockless_begin(vcpu);
@@ -3458,7 +3478,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
u64 new_spte;
if (tdp_mmu_enabled)
- sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, fault->addr, &spte);
+ sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, fault->gfn, &spte);
else
sptep = fast_pf_get_last_sptep(vcpu, fault->addr, &spte);
@@ -3468,7 +3488,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
* available as the vCPU holds a reference to its root(s).
*/
if (WARN_ON_ONCE(!sptep))
- spte = REMOVED_SPTE;
+ spte = FROZEN_SPTE;
if (!is_shadow_present_pte(spte))
break;
@@ -4134,23 +4154,31 @@ static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level
return leaf;
}
-/* return true if reserved bit(s) are detected on a valid, non-MMIO SPTE. */
-static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
+static int get_sptes_lockless(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
+ int *root_level)
{
- u64 sptes[PT64_ROOT_MAX_LEVEL + 1];
- struct rsvd_bits_validate *rsvd_check;
- int root, leaf, level;
- bool reserved = false;
+ int leaf;
walk_shadow_page_lockless_begin(vcpu);
if (is_tdp_mmu_active(vcpu))
- leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root);
+ leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, root_level);
else
- leaf = get_walk(vcpu, addr, sptes, &root);
+ leaf = get_walk(vcpu, addr, sptes, root_level);
walk_shadow_page_lockless_end(vcpu);
+ return leaf;
+}
+/* return true if reserved bit(s) are detected on a valid, non-MMIO SPTE. */
+static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
+{
+ u64 sptes[PT64_ROOT_MAX_LEVEL + 1];
+ struct rsvd_bits_validate *rsvd_check;
+ int root, leaf, level;
+ bool reserved = false;
+
+ leaf = get_sptes_lockless(vcpu, addr, sptes, &root);
if (unlikely(leaf < 0)) {
*sptep = 0ull;
return reserved;
@@ -4196,7 +4224,7 @@ static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
if (WARN_ON_ONCE(reserved))
return -EINVAL;
- if (is_mmio_spte(spte)) {
+ if (is_mmio_spte(vcpu->kvm, spte)) {
gfn_t gfn = get_mmio_spte_gfn(spte);
unsigned int access = get_mmio_spte_access(spte);
@@ -4259,24 +4287,28 @@ static u32 alloc_apf_token(struct kvm_vcpu *vcpu)
return (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
}
-static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
- gfn_t gfn)
+static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault)
{
struct kvm_arch_async_pf arch;
arch.token = alloc_apf_token(vcpu);
- arch.gfn = gfn;
+ arch.gfn = fault->gfn;
+ arch.error_code = fault->error_code;
arch.direct_map = vcpu->arch.mmu->root_role.direct;
arch.cr3 = kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu);
- return kvm_setup_async_pf(vcpu, cr2_or_gpa,
- kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
+ return kvm_setup_async_pf(vcpu, fault->addr,
+ kvm_vcpu_gfn_to_hva(vcpu, fault->gfn), &arch);
}
void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
{
int r;
+ if (WARN_ON_ONCE(work->arch.error_code & PFERR_PRIVATE_ACCESS))
+ return;
+
if ((vcpu->arch.mmu->root_role.direct != work->arch.direct_map) ||
work->wakeup_all)
return;
@@ -4289,7 +4321,16 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
work->arch.cr3 != kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu))
return;
- kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true, NULL);
+ r = kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, work->arch.error_code,
+ true, NULL, NULL);
+
+ /*
+ * Account fixed page faults, otherwise they'll never be counted, but
+ * ignore stats for all other return times. Page-ready "faults" aren't
+ * truly spurious and never trigger emulation
+ */
+ if (r == RET_PF_FIXED)
+ vcpu->stat.pf_fixed++;
}
static inline u8 kvm_max_level_for_order(int order)
@@ -4309,12 +4350,23 @@ static inline u8 kvm_max_level_for_order(int order)
return PG_LEVEL_4K;
}
-static void kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
- struct kvm_page_fault *fault)
+static u8 kvm_max_private_mapping_level(struct kvm *kvm, kvm_pfn_t pfn,
+ u8 max_level, int gmem_order)
{
- kvm_prepare_memory_fault_exit(vcpu, fault->gfn << PAGE_SHIFT,
- PAGE_SIZE, fault->write, fault->exec,
- fault->is_private);
+ u8 req_max_level;
+
+ if (max_level == PG_LEVEL_4K)
+ return PG_LEVEL_4K;
+
+ max_level = min(kvm_max_level_for_order(gmem_order), max_level);
+ if (max_level == PG_LEVEL_4K)
+ return PG_LEVEL_4K;
+
+ req_max_level = kvm_x86_call(private_max_mapping_level)(kvm, pfn);
+ if (req_max_level)
+ max_level = min(max_level, req_max_level);
+
+ return max_level;
}
static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu,
@@ -4334,57 +4386,24 @@ static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu,
return r;
}
- fault->max_level = min(kvm_max_level_for_order(max_order),
- fault->max_level);
fault->map_writable = !(fault->slot->flags & KVM_MEM_READONLY);
+ fault->max_level = kvm_max_private_mapping_level(vcpu->kvm, fault->pfn,
+ fault->max_level, max_order);
return RET_PF_CONTINUE;
}
static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
- struct kvm_memory_slot *slot = fault->slot;
bool async;
- /*
- * Retry the page fault if the gfn hit a memslot that is being deleted
- * or moved. This ensures any existing SPTEs for the old memslot will
- * be zapped before KVM inserts a new MMIO SPTE for the gfn.
- */
- if (slot && (slot->flags & KVM_MEMSLOT_INVALID))
- return RET_PF_RETRY;
-
- if (!kvm_is_visible_memslot(slot)) {
- /* Don't expose private memslots to L2. */
- if (is_guest_mode(vcpu)) {
- fault->slot = NULL;
- fault->pfn = KVM_PFN_NOSLOT;
- fault->map_writable = false;
- return RET_PF_CONTINUE;
- }
- /*
- * If the APIC access page exists but is disabled, go directly
- * to emulation without caching the MMIO access or creating a
- * MMIO SPTE. That way the cache doesn't need to be purged
- * when the AVIC is re-enabled.
- */
- if (slot && slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT &&
- !kvm_apicv_activated(vcpu->kvm))
- return RET_PF_EMULATE;
- }
-
- if (fault->is_private != kvm_mem_is_private(vcpu->kvm, fault->gfn)) {
- kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
- return -EFAULT;
- }
-
if (fault->is_private)
return kvm_faultin_pfn_private(vcpu, fault);
async = false;
- fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, false, &async,
- fault->write, &fault->map_writable,
- &fault->hva);
+ fault->pfn = __gfn_to_pfn_memslot(fault->slot, fault->gfn, false, false,
+ &async, fault->write,
+ &fault->map_writable, &fault->hva);
if (!async)
return RET_PF_CONTINUE; /* *pfn has correct page already */
@@ -4394,7 +4413,7 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
trace_kvm_async_pf_repeated_fault(fault->addr, fault->gfn);
kvm_make_request(KVM_REQ_APF_HALT, vcpu);
return RET_PF_RETRY;
- } else if (kvm_arch_setup_async_pf(vcpu, fault->addr, fault->gfn)) {
+ } else if (kvm_arch_setup_async_pf(vcpu, fault)) {
return RET_PF_RETRY;
}
}
@@ -4404,21 +4423,73 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
* to wait for IO. Note, gup always bails if it is unable to quickly
* get a page and a fatal signal, i.e. SIGKILL, is pending.
*/
- fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, true, NULL,
- fault->write, &fault->map_writable,
- &fault->hva);
+ fault->pfn = __gfn_to_pfn_memslot(fault->slot, fault->gfn, false, true,
+ NULL, fault->write,
+ &fault->map_writable, &fault->hva);
return RET_PF_CONTINUE;
}
static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
unsigned int access)
{
+ struct kvm_memory_slot *slot = fault->slot;
int ret;
+ /*
+ * Note that the mmu_invalidate_seq also serves to detect a concurrent
+ * change in attributes. is_page_fault_stale() will detect an
+ * invalidation relate to fault->fn and resume the guest without
+ * installing a mapping in the page tables.
+ */
fault->mmu_seq = vcpu->kvm->mmu_invalidate_seq;
smp_rmb();
/*
+ * Now that we have a snapshot of mmu_invalidate_seq we can check for a
+ * private vs. shared mismatch.
+ */
+ if (fault->is_private != kvm_mem_is_private(vcpu->kvm, fault->gfn)) {
+ kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
+ return -EFAULT;
+ }
+
+ if (unlikely(!slot))
+ return kvm_handle_noslot_fault(vcpu, fault, access);
+
+ /*
+ * Retry the page fault if the gfn hit a memslot that is being deleted
+ * or moved. This ensures any existing SPTEs for the old memslot will
+ * be zapped before KVM inserts a new MMIO SPTE for the gfn.
+ */
+ if (slot->flags & KVM_MEMSLOT_INVALID)
+ return RET_PF_RETRY;
+
+ if (slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT) {
+ /*
+ * Don't map L1's APIC access page into L2, KVM doesn't support
+ * using APICv/AVIC to accelerate L2 accesses to L1's APIC,
+ * i.e. the access needs to be emulated. Emulating access to
+ * L1's APIC is also correct if L1 is accelerating L2's own
+ * virtual APIC, but for some reason L1 also maps _L1's_ APIC
+ * into L2. Note, vcpu_is_mmio_gpa() always treats access to
+ * the APIC as MMIO. Allow an MMIO SPTE to be created, as KVM
+ * uses different roots for L1 vs. L2, i.e. there is no danger
+ * of breaking APICv/AVIC for L1.
+ */
+ if (is_guest_mode(vcpu))
+ return kvm_handle_noslot_fault(vcpu, fault, access);
+
+ /*
+ * If the APIC access page exists but is disabled, go directly
+ * to emulation without caching the MMIO access or creating a
+ * MMIO SPTE. That way the cache doesn't need to be purged
+ * when the AVIC is re-enabled.
+ */
+ if (!kvm_apicv_activated(vcpu->kvm))
+ return RET_PF_EMULATE;
+ }
+
+ /*
* Check for a relevant mmu_notifier invalidation event before getting
* the pfn from the primary MMU, and before acquiring mmu_lock.
*
@@ -4439,8 +4510,7 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
* *guaranteed* to need to retry, i.e. waiting until mmu_lock is held
* to detect retry guarantees the worst case latency for the vCPU.
*/
- if (fault->slot &&
- mmu_invalidate_retry_gfn_unsafe(vcpu->kvm, fault->mmu_seq, fault->gfn))
+ if (mmu_invalidate_retry_gfn_unsafe(vcpu->kvm, fault->mmu_seq, fault->gfn))
return RET_PF_RETRY;
ret = __kvm_faultin_pfn(vcpu, fault);
@@ -4450,7 +4520,7 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
if (unlikely(is_error_pfn(fault->pfn)))
return kvm_handle_error_pfn(vcpu, fault);
- if (unlikely(!fault->slot))
+ if (WARN_ON_ONCE(!fault->slot || is_noslot_pfn(fault->pfn)))
return kvm_handle_noslot_fault(vcpu, fault, access);
/*
@@ -4510,7 +4580,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
return RET_PF_RETRY;
if (page_fault_handle_page_track(vcpu, fault))
- return RET_PF_EMULATE;
+ return RET_PF_WRITE_PROTECTED;
r = fast_page_fault(vcpu, fault);
if (r != RET_PF_INVALID)
@@ -4561,13 +4631,24 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
if (WARN_ON_ONCE(fault_address >> 32))
return -EFAULT;
#endif
+ /*
+ * Legacy #PF exception only have a 32-bit error code. Simply drop the
+ * upper bits as KVM doesn't use them for #PF (because they are never
+ * set), and to ensure there are no collisions with KVM-defined bits.
+ */
+ if (WARN_ON_ONCE(error_code >> 32))
+ error_code = lower_32_bits(error_code);
+
+ /*
+ * Restrict KVM-defined flags to bits 63:32 so that it's impossible for
+ * them to conflict with #PF error codes, which are limited to 32 bits.
+ */
+ BUILD_BUG_ON(lower_32_bits(PFERR_SYNTHETIC_MASK));
vcpu->arch.l1tf_flush_l1d = true;
if (!flags) {
trace_kvm_page_fault(vcpu, fault_address, error_code);
- if (kvm_event_needs_reinjection(vcpu))
- kvm_mmu_unprotect_page_virt(vcpu, fault_address);
r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn,
insn_len);
} else if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) {
@@ -4590,7 +4671,7 @@ static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu,
int r;
if (page_fault_handle_page_track(vcpu, fault))
- return RET_PF_EMULATE;
+ return RET_PF_WRITE_PROTECTED;
r = fast_page_fault(vcpu, fault);
if (r != RET_PF_INVALID)
@@ -4619,38 +4700,21 @@ out_unlock:
}
#endif
-bool __kvm_mmu_honors_guest_mtrrs(bool vm_has_noncoherent_dma)
+bool kvm_mmu_may_ignore_guest_pat(void)
{
/*
- * If host MTRRs are ignored (shadow_memtype_mask is non-zero), and the
- * VM has non-coherent DMA (DMA doesn't snoop CPU caches), KVM's ABI is
- * to honor the memtype from the guest's MTRRs so that guest accesses
- * to memory that is DMA'd aren't cached against the guest's wishes.
- *
- * Note, KVM may still ultimately ignore guest MTRRs for certain PFNs,
- * e.g. KVM will force UC memtype for host MMIO.
+ * When EPT is enabled (shadow_memtype_mask is non-zero), and the VM
+ * has non-coherent DMA (DMA doesn't snoop CPU caches), KVM's ABI is to
+ * honor the memtype from the guest's PAT so that guest accesses to
+ * memory that is DMA'd aren't cached against the guest's wishes. As a
+ * result, KVM _may_ ignore guest PAT, whereas without non-coherent DMA,
+ * KVM _always_ ignores guest PAT (when EPT is enabled).
*/
- return vm_has_noncoherent_dma && shadow_memtype_mask;
+ return shadow_memtype_mask;
}
int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
- /*
- * If the guest's MTRRs may be used to compute the "real" memtype,
- * restrict the mapping level to ensure KVM uses a consistent memtype
- * across the entire mapping.
- */
- if (kvm_mmu_honors_guest_mtrrs(vcpu->kvm)) {
- for ( ; fault->max_level > PG_LEVEL_4K; --fault->max_level) {
- int page_num = KVM_PAGES_PER_HPAGE(fault->max_level);
- gfn_t base = gfn_round_for_level(fault->gfn,
- fault->max_level);
-
- if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num))
- break;
- }
- }
-
#ifdef CONFIG_X86_64
if (tdp_mmu_enabled)
return kvm_tdp_mmu_page_fault(vcpu, fault);
@@ -4659,6 +4723,85 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
return direct_page_fault(vcpu, fault);
}
+static int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code,
+ u8 *level)
+{
+ int r;
+
+ /*
+ * Restrict to TDP page fault, since that's the only case where the MMU
+ * is indexed by GPA.
+ */
+ if (vcpu->arch.mmu->page_fault != kvm_tdp_page_fault)
+ return -EOPNOTSUPP;
+
+ do {
+ if (signal_pending(current))
+ return -EINTR;
+ cond_resched();
+ r = kvm_mmu_do_page_fault(vcpu, gpa, error_code, true, NULL, level);
+ } while (r == RET_PF_RETRY);
+
+ if (r < 0)
+ return r;
+
+ switch (r) {
+ case RET_PF_FIXED:
+ case RET_PF_SPURIOUS:
+ case RET_PF_WRITE_PROTECTED:
+ return 0;
+
+ case RET_PF_EMULATE:
+ return -ENOENT;
+
+ case RET_PF_RETRY:
+ case RET_PF_CONTINUE:
+ case RET_PF_INVALID:
+ default:
+ WARN_ONCE(1, "could not fix page fault during prefault");
+ return -EIO;
+ }
+}
+
+long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
+ struct kvm_pre_fault_memory *range)
+{
+ u64 error_code = PFERR_GUEST_FINAL_MASK;
+ u8 level = PG_LEVEL_4K;
+ u64 end;
+ int r;
+
+ if (!vcpu->kvm->arch.pre_fault_allowed)
+ return -EOPNOTSUPP;
+
+ /*
+ * reload is efficient when called repeatedly, so we can do it on
+ * every iteration.
+ */
+ r = kvm_mmu_reload(vcpu);
+ if (r)
+ return r;
+
+ if (kvm_arch_has_private_mem(vcpu->kvm) &&
+ kvm_mem_is_private(vcpu->kvm, gpa_to_gfn(range->gpa)))
+ error_code |= PFERR_PRIVATE_ACCESS;
+
+ /*
+ * Shadow paging uses GVA for kvm page fault, so restrict to
+ * two-dimensional paging.
+ */
+ r = kvm_tdp_map_page(vcpu, range->gpa, error_code, &level);
+ if (r < 0)
+ return r;
+
+ /*
+ * If the mapping that covers range->gpa can use a huge page, it
+ * may start below it or end after range->gpa + range->size.
+ */
+ end = (range->gpa & KVM_HPAGE_MASK(level)) + KVM_HPAGE_SIZE(level);
+ return min(range->size, end - range->gpa);
+}
+
static void nonpaging_init_context(struct kvm_mmu *context)
{
context->page_fault = nonpaging_page_fault;
@@ -4812,7 +4955,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_new_pgd);
static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
unsigned int access)
{
- if (unlikely(is_mmio_spte(*sptep))) {
+ if (unlikely(is_mmio_spte(vcpu->kvm, *sptep))) {
if (gfn != get_mmio_spte_gfn(*sptep)) {
mmu_spte_clear_no_track(sptep);
return true;
@@ -4986,7 +5129,7 @@ static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
static inline u64 reserved_hpa_bits(void)
{
- return rsvd_bits(shadow_phys_bits, 63);
+ return rsvd_bits(kvm_host.maxphyaddr, 63);
}
/*
@@ -5322,6 +5465,11 @@ static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu)
return max_tdp_level;
}
+u8 kvm_mmu_get_max_tdp_level(void)
+{
+ return tdp_root_level ? tdp_root_level : max_tdp_level;
+}
+
static union kvm_mmu_page_role
kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu,
union kvm_cpu_role cpu_role)
@@ -5626,7 +5774,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
* stale entries. Flushing on alloc also allows KVM to skip the TLB
* flush when freeing a root (see kvm_tdp_mmu_put_root()).
*/
- static_call(kvm_x86_flush_tlb_current)(vcpu);
+ kvm_x86_call(flush_tlb_current)(vcpu);
out:
return r;
}
@@ -5802,10 +5950,15 @@ void kvm_mmu_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
bool flush = false;
/*
- * If we don't have indirect shadow pages, it means no page is
- * write-protected, so we can exit simply.
+ * When emulating guest writes, ensure the written value is visible to
+ * any task that is handling page faults before checking whether or not
+ * KVM is shadowing a guest PTE. This ensures either KVM will create
+ * the correct SPTE in the page fault handler, or this task will see
+ * a non-zero indirect_shadow_pages. Pairs with the smp_mb() in
+ * account_shadowed().
*/
- if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
+ smp_mb();
+ if (!vcpu->kvm->arch.indirect_shadow_pages)
return;
write_lock(&vcpu->kvm->mmu_lock);
@@ -5840,78 +5993,187 @@ void kvm_mmu_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
write_unlock(&vcpu->kvm->mmu_lock);
}
-int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
- void *insn, int insn_len)
+static bool is_write_to_guest_page_table(u64 error_code)
+{
+ const u64 mask = PFERR_GUEST_PAGE_MASK | PFERR_WRITE_MASK | PFERR_PRESENT_MASK;
+
+ return (error_code & mask) == mask;
+}
+
+static int kvm_mmu_write_protect_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
+ u64 error_code, int *emulation_type)
{
- int r, emulation_type = EMULTYPE_PF;
bool direct = vcpu->arch.mmu->root_role.direct;
/*
- * IMPLICIT_ACCESS is a KVM-defined flag used to correctly perform SMAP
- * checks when emulating instructions that triggers implicit access.
- * WARN if hardware generates a fault with an error code that collides
- * with the KVM-defined value. Clear the flag and continue on, i.e.
- * don't terminate the VM, as KVM can't possibly be relying on a flag
- * that KVM doesn't know about.
+ * Do not try to unprotect and retry if the vCPU re-faulted on the same
+ * RIP with the same address that was previously unprotected, as doing
+ * so will likely put the vCPU into an infinite. E.g. if the vCPU uses
+ * a non-page-table modifying instruction on the PDE that points to the
+ * instruction, then unprotecting the gfn will unmap the instruction's
+ * code, i.e. make it impossible for the instruction to ever complete.
+ */
+ if (vcpu->arch.last_retry_eip == kvm_rip_read(vcpu) &&
+ vcpu->arch.last_retry_addr == cr2_or_gpa)
+ return RET_PF_EMULATE;
+
+ /*
+ * Reset the unprotect+retry values that guard against infinite loops.
+ * The values will be refreshed if KVM explicitly unprotects a gfn and
+ * retries, in all other cases it's safe to retry in the future even if
+ * the next page fault happens on the same RIP+address.
+ */
+ vcpu->arch.last_retry_eip = 0;
+ vcpu->arch.last_retry_addr = 0;
+
+ /*
+ * It should be impossible to reach this point with an MMIO cache hit,
+ * as RET_PF_WRITE_PROTECTED is returned if and only if there's a valid,
+ * writable memslot, and creating a memslot should invalidate the MMIO
+ * cache by way of changing the memslot generation. WARN and disallow
+ * retry if MMIO is detected, as retrying MMIO emulation is pointless
+ * and could put the vCPU into an infinite loop because the processor
+ * will keep faulting on the non-existent MMIO address.
+ */
+ if (WARN_ON_ONCE(mmio_info_in_cache(vcpu, cr2_or_gpa, direct)))
+ return RET_PF_EMULATE;
+
+ /*
+ * Before emulating the instruction, check to see if the access was due
+ * to a read-only violation while the CPU was walking non-nested NPT
+ * page tables, i.e. for a direct MMU, for _guest_ page tables in L1.
+ * If L1 is sharing (a subset of) its page tables with L2, e.g. by
+ * having nCR3 share lower level page tables with hCR3, then when KVM
+ * (L0) write-protects the nested NPTs, i.e. npt12 entries, KVM is also
+ * unknowingly write-protecting L1's guest page tables, which KVM isn't
+ * shadowing.
+ *
+ * Because the CPU (by default) walks NPT page tables using a write
+ * access (to ensure the CPU can do A/D updates), page walks in L1 can
+ * trigger write faults for the above case even when L1 isn't modifying
+ * PTEs. As a result, KVM will unnecessarily emulate (or at least, try
+ * to emulate) an excessive number of L1 instructions; because L1's MMU
+ * isn't shadowed by KVM, there is no need to write-protect L1's gPTEs
+ * and thus no need to emulate in order to guarantee forward progress.
+ *
+ * Try to unprotect the gfn, i.e. zap any shadow pages, so that L1 can
+ * proceed without triggering emulation. If one or more shadow pages
+ * was zapped, skip emulation and resume L1 to let it natively execute
+ * the instruction. If no shadow pages were zapped, then the write-
+ * fault is due to something else entirely, i.e. KVM needs to emulate,
+ * as resuming the guest will put it into an infinite loop.
+ *
+ * Note, this code also applies to Intel CPUs, even though it is *very*
+ * unlikely that an L1 will share its page tables (IA32/PAE/paging64
+ * format) with L2's page tables (EPT format).
+ *
+ * For indirect MMUs, i.e. if KVM is shadowing the current MMU, try to
+ * unprotect the gfn and retry if an event is awaiting reinjection. If
+ * KVM emulates multiple instructions before completing event injection,
+ * the event could be delayed beyond what is architecturally allowed,
+ * e.g. KVM could inject an IRQ after the TPR has been raised.
+ */
+ if (((direct && is_write_to_guest_page_table(error_code)) ||
+ (!direct && kvm_event_needs_reinjection(vcpu))) &&
+ kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa))
+ return RET_PF_RETRY;
+
+ /*
+ * The gfn is write-protected, but if KVM detects its emulating an
+ * instruction that is unlikely to be used to modify page tables, or if
+ * emulation fails, KVM can try to unprotect the gfn and let the CPU
+ * re-execute the instruction that caused the page fault. Do not allow
+ * retrying an instruction from a nested guest as KVM is only explicitly
+ * shadowing L1's page tables, i.e. unprotecting something for L1 isn't
+ * going to magically fix whatever issue caused L2 to fail.
*/
- if (WARN_ON_ONCE(error_code & PFERR_IMPLICIT_ACCESS))
- error_code &= ~PFERR_IMPLICIT_ACCESS;
+ if (!is_guest_mode(vcpu))
+ *emulation_type |= EMULTYPE_ALLOW_RETRY_PF;
+
+ return RET_PF_EMULATE;
+}
+
+int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
+ void *insn, int insn_len)
+{
+ int r, emulation_type = EMULTYPE_PF;
+ bool direct = vcpu->arch.mmu->root_role.direct;
if (WARN_ON_ONCE(!VALID_PAGE(vcpu->arch.mmu->root.hpa)))
return RET_PF_RETRY;
+ /*
+ * Except for reserved faults (emulated MMIO is shared-only), set the
+ * PFERR_PRIVATE_ACCESS flag for software-protected VMs based on the gfn's
+ * current attributes, which are the source of truth for such VMs. Note,
+ * this wrong for nested MMUs as the GPA is an L2 GPA, but KVM doesn't
+ * currently supported nested virtualization (among many other things)
+ * for software-protected VMs.
+ */
+ if (IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) &&
+ !(error_code & PFERR_RSVD_MASK) &&
+ vcpu->kvm->arch.vm_type == KVM_X86_SW_PROTECTED_VM &&
+ kvm_mem_is_private(vcpu->kvm, gpa_to_gfn(cr2_or_gpa)))
+ error_code |= PFERR_PRIVATE_ACCESS;
+
r = RET_PF_INVALID;
if (unlikely(error_code & PFERR_RSVD_MASK)) {
+ if (WARN_ON_ONCE(error_code & PFERR_PRIVATE_ACCESS))
+ return -EFAULT;
+
r = handle_mmio_page_fault(vcpu, cr2_or_gpa, direct);
if (r == RET_PF_EMULATE)
goto emulate;
}
if (r == RET_PF_INVALID) {
- r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa,
- lower_32_bits(error_code), false,
- &emulation_type);
+ vcpu->stat.pf_taken++;
+
+ r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa, error_code, false,
+ &emulation_type, NULL);
if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm))
return -EIO;
}
if (r < 0)
return r;
+
+ if (r == RET_PF_WRITE_PROTECTED)
+ r = kvm_mmu_write_protect_fault(vcpu, cr2_or_gpa, error_code,
+ &emulation_type);
+
+ if (r == RET_PF_FIXED)
+ vcpu->stat.pf_fixed++;
+ else if (r == RET_PF_EMULATE)
+ vcpu->stat.pf_emulate++;
+ else if (r == RET_PF_SPURIOUS)
+ vcpu->stat.pf_spurious++;
+
if (r != RET_PF_EMULATE)
return 1;
- /*
- * Before emulating the instruction, check if the error code
- * was due to a RO violation while translating the guest page.
- * This can occur when using nested virtualization with nested
- * paging in both guests. If true, we simply unprotect the page
- * and resume the guest.
- */
- if (vcpu->arch.mmu->root_role.direct &&
- (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) {
- kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa));
- return 1;
- }
-
- /*
- * vcpu->arch.mmu.page_fault returned RET_PF_EMULATE, but we can still
- * optimistically try to just unprotect the page and let the processor
- * re-execute the instruction that caused the page fault. Do not allow
- * retrying MMIO emulation, as it's not only pointless but could also
- * cause us to enter an infinite loop because the processor will keep
- * faulting on the non-existent MMIO address. Retrying an instruction
- * from a nested guest is also pointless and dangerous as we are only
- * explicitly shadowing L1's page tables, i.e. unprotecting something
- * for L1 isn't going to magically fix whatever issue cause L2 to fail.
- */
- if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu))
- emulation_type |= EMULTYPE_ALLOW_RETRY_PF;
emulate:
return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn,
insn_len);
}
EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
+void kvm_mmu_print_sptes(struct kvm_vcpu *vcpu, gpa_t gpa, const char *msg)
+{
+ u64 sptes[PT64_ROOT_MAX_LEVEL + 1];
+ int root_level, leaf, level;
+
+ leaf = get_sptes_lockless(vcpu, gpa, sptes, &root_level);
+ if (unlikely(leaf < 0))
+ return;
+
+ pr_err("%s %llx", msg, gpa);
+ for (level = root_level; level >= leaf; level--)
+ pr_cont(", spte[%d] = 0x%llx", level, sptes[level]);
+ pr_cont("\n");
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_print_sptes);
+
static void __kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
u64 addr, hpa_t root_hpa)
{
@@ -5962,7 +6224,7 @@ void kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
if (is_noncanonical_address(addr, vcpu))
return;
- static_call(kvm_x86_flush_tlb_gva)(vcpu, addr);
+ kvm_x86_call(flush_tlb_gva)(vcpu, addr);
}
if (!mmu->sync_spte)
@@ -6048,59 +6310,6 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
}
EXPORT_SYMBOL_GPL(kvm_configure_mmu);
-/* The return value indicates if tlb flush on all vcpus is needed. */
-typedef bool (*slot_rmaps_handler) (struct kvm *kvm,
- struct kvm_rmap_head *rmap_head,
- const struct kvm_memory_slot *slot);
-
-static __always_inline bool __walk_slot_rmaps(struct kvm *kvm,
- const struct kvm_memory_slot *slot,
- slot_rmaps_handler fn,
- int start_level, int end_level,
- gfn_t start_gfn, gfn_t end_gfn,
- bool flush_on_yield, bool flush)
-{
- struct slot_rmap_walk_iterator iterator;
-
- lockdep_assert_held_write(&kvm->mmu_lock);
-
- for_each_slot_rmap_range(slot, start_level, end_level, start_gfn,
- end_gfn, &iterator) {
- if (iterator.rmap)
- flush |= fn(kvm, iterator.rmap, slot);
-
- if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
- if (flush && flush_on_yield) {
- kvm_flush_remote_tlbs_range(kvm, start_gfn,
- iterator.gfn - start_gfn + 1);
- flush = false;
- }
- cond_resched_rwlock_write(&kvm->mmu_lock);
- }
- }
-
- return flush;
-}
-
-static __always_inline bool walk_slot_rmaps(struct kvm *kvm,
- const struct kvm_memory_slot *slot,
- slot_rmaps_handler fn,
- int start_level, int end_level,
- bool flush_on_yield)
-{
- return __walk_slot_rmaps(kvm, slot, fn, start_level, end_level,
- slot->base_gfn, slot->base_gfn + slot->npages - 1,
- flush_on_yield, false);
-}
-
-static __always_inline bool walk_slot_rmaps_4k(struct kvm *kvm,
- const struct kvm_memory_slot *slot,
- slot_rmaps_handler fn,
- bool flush_on_yield)
-{
- return walk_slot_rmaps(kvm, slot, fn, PG_LEVEL_4K, PG_LEVEL_4K, flush_on_yield);
-}
-
static void free_mmu_pages(struct kvm_mmu *mmu)
{
if (!tdp_enabled && mmu->pae_root)
@@ -6173,7 +6382,10 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
vcpu->arch.mmu_page_header_cache.kmem_cache = mmu_page_header_cache;
vcpu->arch.mmu_page_header_cache.gfp_zero = __GFP_ZERO;
- vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO;
+ vcpu->arch.mmu_shadow_page_cache.init_value =
+ SHADOW_NONPRESENT_VALUE;
+ if (!vcpu->arch.mmu_shadow_page_cache.init_value)
+ vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO;
vcpu->arch.mmu = &vcpu->arch.root_mmu;
vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
@@ -6316,6 +6528,7 @@ static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
void kvm_mmu_init_vm(struct kvm *kvm)
{
+ kvm->arch.shadow_mmio_value = shadow_mmio_value;
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages);
@@ -6370,9 +6583,8 @@ static bool kvm_rmap_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_e
if (WARN_ON_ONCE(start >= end))
continue;
- flush = __walk_slot_rmaps(kvm, memslot, __kvm_zap_rmap,
- PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
- start, end - 1, true, flush);
+ flush = __kvm_rmap_zap_gfn_range(kvm, memslot, start,
+ end, true, flush);
}
}
@@ -6660,7 +6872,7 @@ static void kvm_shadow_mmu_try_split_huge_pages(struct kvm *kvm,
*/
for (level = KVM_MAX_HUGEPAGE_LEVEL; level > target_level; level--)
__walk_slot_rmaps(kvm, slot, shadow_mmu_try_split_huge_pages,
- level, level, start, end - 1, true, false);
+ level, level, start, end - 1, true, true, false);
}
/* Must be called with the mmu_lock held in write-mode. */
@@ -6750,6 +6962,7 @@ restart:
return need_tlb_flush;
}
+EXPORT_SYMBOL_GPL(kvm_zap_gfn_range);
static void kvm_rmap_zap_collapsible_sptes(struct kvm *kvm,
const struct kvm_memory_slot *slot)
@@ -6838,10 +7051,70 @@ void kvm_arch_flush_shadow_all(struct kvm *kvm)
kvm_mmu_zap_all(kvm);
}
+static void kvm_mmu_zap_memslot_pages_and_flush(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ bool flush)
+{
+ LIST_HEAD(invalid_list);
+ unsigned long i;
+
+ if (list_empty(&kvm->arch.active_mmu_pages))
+ goto out_flush;
+
+ /*
+ * Since accounting information is stored in struct kvm_arch_memory_slot,
+ * shadow pages deletion (e.g. unaccount_shadowed()) requires that all
+ * gfns with a shadow page have a corresponding memslot. Do so before
+ * the memslot goes away.
+ */
+ for (i = 0; i < slot->npages; i++) {
+ struct kvm_mmu_page *sp;
+ gfn_t gfn = slot->base_gfn + i;
+
+ for_each_gfn_valid_sp(kvm, sp, gfn)
+ kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
+
+ if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
+ kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
+ flush = false;
+ cond_resched_rwlock_write(&kvm->mmu_lock);
+ }
+ }
+
+out_flush:
+ kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
+}
+
+static void kvm_mmu_zap_memslot(struct kvm *kvm,
+ struct kvm_memory_slot *slot)
+{
+ struct kvm_gfn_range range = {
+ .slot = slot,
+ .start = slot->base_gfn,
+ .end = slot->base_gfn + slot->npages,
+ .may_block = true,
+ };
+ bool flush;
+
+ write_lock(&kvm->mmu_lock);
+ flush = kvm_unmap_gfn_range(kvm, &range);
+ kvm_mmu_zap_memslot_pages_and_flush(kvm, slot, flush);
+ write_unlock(&kvm->mmu_lock);
+}
+
+static inline bool kvm_memslot_flush_zap_all(struct kvm *kvm)
+{
+ return kvm->arch.vm_type == KVM_X86_DEFAULT_VM &&
+ kvm_check_has_quirk(kvm, KVM_X86_QUIRK_SLOT_ZAP_ALL);
+}
+
void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
struct kvm_memory_slot *slot)
{
- kvm_mmu_zap_all_fast(kvm);
+ if (kvm_memslot_flush_zap_all(kvm))
+ kvm_mmu_zap_all_fast(kvm);
+ else
+ kvm_mmu_zap_memslot(kvm, slot);
}
void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
@@ -6880,7 +7153,6 @@ static unsigned long mmu_shrink_scan(struct shrinker *shrink,
list_for_each_entry(kvm, &vm_list, vm_list) {
int idx;
- LIST_HEAD(invalid_list);
/*
* Never scan more than sc->nr_to_scan VM instances.
@@ -7355,7 +7627,7 @@ static bool hugepage_has_attrs(struct kvm *kvm, struct kvm_memory_slot *slot,
const unsigned long end = start + KVM_PAGES_PER_HPAGE(level);
if (level == PG_LEVEL_2M)
- return kvm_range_has_memory_attributes(kvm, start, end, attrs);
+ return kvm_range_has_memory_attributes(kvm, start, end, ~0, attrs);
for (gfn = start; gfn < end; gfn += KVM_PAGES_PER_HPAGE(level - 1)) {
if (hugepage_test_mixed(slot, gfn, level - 1) ||
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index 5390a591a571..c98827840e07 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -190,7 +190,7 @@ static inline bool is_nx_huge_page_enabled(struct kvm *kvm)
struct kvm_page_fault {
/* arguments to kvm_mmu_do_page_fault. */
const gpa_t addr;
- const u32 error_code;
+ const u64 error_code;
const bool prefetch;
/* Derived from error_code. */
@@ -258,6 +258,8 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
* RET_PF_CONTINUE: So far, so good, keep handling the page fault.
* RET_PF_RETRY: let CPU fault again on the address.
* RET_PF_EMULATE: mmio page fault, emulate the instruction directly.
+ * RET_PF_WRITE_PROTECTED: the gfn is write-protected, either unprotected the
+ * gfn and retry, or emulate the instruction directly.
* RET_PF_INVALID: the spte is invalid, let the real page fault path update it.
* RET_PF_FIXED: The faulting entry has been fixed.
* RET_PF_SPURIOUS: The faulting entry was already fixed, e.g. by another vCPU.
@@ -274,13 +276,23 @@ enum {
RET_PF_CONTINUE = 0,
RET_PF_RETRY,
RET_PF_EMULATE,
+ RET_PF_WRITE_PROTECTED,
RET_PF_INVALID,
RET_PF_FIXED,
RET_PF_SPURIOUS,
};
+static inline void kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault)
+{
+ kvm_prepare_memory_fault_exit(vcpu, fault->gfn << PAGE_SHIFT,
+ PAGE_SIZE, fault->write, fault->exec,
+ fault->is_private);
+}
+
static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
- u32 err, bool prefetch, int *emulation_type)
+ u64 err, bool prefetch,
+ int *emulation_type, u8 *level)
{
struct kvm_page_fault fault = {
.addr = cr2_or_gpa,
@@ -298,7 +310,10 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
.max_level = KVM_MAX_HUGEPAGE_LEVEL,
.req_level = PG_LEVEL_4K,
.goal_level = PG_LEVEL_4K,
- .is_private = kvm_mem_is_private(vcpu->kvm, cr2_or_gpa >> PAGE_SHIFT),
+ .is_private = err & PFERR_PRIVATE_ACCESS,
+
+ .pfn = KVM_PFN_ERR_FAULT,
+ .hva = KVM_HVA_ERR_BAD,
};
int r;
@@ -307,35 +322,27 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
fault.slot = kvm_vcpu_gfn_to_memslot(vcpu, fault.gfn);
}
- /*
- * Async #PF "faults", a.k.a. prefetch faults, are not faults from the
- * guest perspective and have already been counted at the time of the
- * original fault.
- */
- if (!prefetch)
- vcpu->stat.pf_taken++;
-
if (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) && fault.is_tdp)
r = kvm_tdp_page_fault(vcpu, &fault);
else
r = vcpu->arch.mmu->page_fault(vcpu, &fault);
+ /*
+ * Not sure what's happening, but punt to userspace and hope that
+ * they can fix it by changing memory to shared, or they can
+ * provide a better error.
+ */
+ if (r == RET_PF_EMULATE && fault.is_private) {
+ pr_warn_ratelimited("kvm: unexpected emulation request on private memory\n");
+ kvm_mmu_prepare_memory_fault_exit(vcpu, &fault);
+ return -EFAULT;
+ }
+
if (fault.write_fault_to_shadow_pgtable && emulation_type)
*emulation_type |= EMULTYPE_WRITE_PF_TO_SP;
+ if (level)
+ *level = fault.goal_level;
- /*
- * Similar to above, prefetch faults aren't truly spurious, and the
- * async #PF path doesn't do emulation. Do count faults that are fixed
- * by the async #PF handler though, otherwise they'll never be counted.
- */
- if (r == RET_PF_FIXED)
- vcpu->stat.pf_fixed++;
- else if (prefetch)
- ;
- else if (r == RET_PF_EMULATE)
- vcpu->stat.pf_emulate++;
- else if (r == RET_PF_SPURIOUS)
- vcpu->stat.pf_spurious++;
return r;
}
@@ -345,8 +352,6 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm,
void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level);
-void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
-
void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp);
void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp);
diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h
index ae86820cef69..f35a830ce469 100644
--- a/arch/x86/kvm/mmu/mmutrace.h
+++ b/arch/x86/kvm/mmu/mmutrace.h
@@ -57,6 +57,7 @@
TRACE_DEFINE_ENUM(RET_PF_CONTINUE);
TRACE_DEFINE_ENUM(RET_PF_RETRY);
TRACE_DEFINE_ENUM(RET_PF_EMULATE);
+TRACE_DEFINE_ENUM(RET_PF_WRITE_PROTECTED);
TRACE_DEFINE_ENUM(RET_PF_INVALID);
TRACE_DEFINE_ENUM(RET_PF_FIXED);
TRACE_DEFINE_ENUM(RET_PF_SPURIOUS);
@@ -260,7 +261,7 @@ TRACE_EVENT(
TP_STRUCT__entry(
__field(int, vcpu_id)
__field(gpa_t, cr2_or_gpa)
- __field(u32, error_code)
+ __field(u64, error_code)
__field(u64 *, sptep)
__field(u64, old_spte)
__field(u64, new_spte)
diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c
index f6448284c18e..561c331fd6ec 100644
--- a/arch/x86/kvm/mmu/page_track.c
+++ b/arch/x86/kvm/mmu/page_track.c
@@ -41,7 +41,7 @@ bool kvm_page_track_write_tracking_enabled(struct kvm *kvm)
void kvm_page_track_free_memslot(struct kvm_memory_slot *slot)
{
- kvfree(slot->arch.gfn_write_track);
+ vfree(slot->arch.gfn_write_track);
slot->arch.gfn_write_track = NULL;
}
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 4d4e98fe4f35..ae7d39ff2d07 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -497,21 +497,21 @@ error:
* The other bits are set to 0.
*/
if (!(errcode & PFERR_RSVD_MASK)) {
- vcpu->arch.exit_qualification &= (EPT_VIOLATION_GVA_IS_VALID |
- EPT_VIOLATION_GVA_TRANSLATED);
+ walker->fault.exit_qualification = 0;
+
if (write_fault)
- vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_WRITE;
+ walker->fault.exit_qualification |= EPT_VIOLATION_ACC_WRITE;
if (user_fault)
- vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_READ;
+ walker->fault.exit_qualification |= EPT_VIOLATION_ACC_READ;
if (fetch_fault)
- vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_INSTR;
+ walker->fault.exit_qualification |= EPT_VIOLATION_ACC_INSTR;
/*
* Note, pte_access holds the raw RWX bits from the EPTE, not
* ACC_*_MASK flags!
*/
- vcpu->arch.exit_qualification |= (pte_access & VMX_EPT_RWX_MASK) <<
- EPT_VIOLATION_RWX_SHIFT;
+ walker->fault.exit_qualification |= (pte_access & VMX_EPT_RWX_MASK) <<
+ EPT_VIOLATION_RWX_SHIFT;
}
#endif
walker->fault.address = addr;
@@ -646,10 +646,10 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
* really care if it changes underneath us after this point).
*/
if (FNAME(gpte_changed)(vcpu, gw, top_level))
- goto out_gpte_changed;
+ return RET_PF_RETRY;
if (WARN_ON_ONCE(!VALID_PAGE(vcpu->arch.mmu->root.hpa)))
- goto out_gpte_changed;
+ return RET_PF_RETRY;
/*
* Load a new root and retry the faulting instruction in the extremely
@@ -659,7 +659,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
*/
if (unlikely(kvm_mmu_is_dummy_root(vcpu->arch.mmu->root.hpa))) {
kvm_make_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu);
- goto out_gpte_changed;
+ return RET_PF_RETRY;
}
for_each_shadow_entry(vcpu, fault->addr, it) {
@@ -674,34 +674,38 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
sp = kvm_mmu_get_child_sp(vcpu, it.sptep, table_gfn,
false, access);
- if (sp != ERR_PTR(-EEXIST)) {
- /*
- * We must synchronize the pagetable before linking it
- * because the guest doesn't need to flush tlb when
- * the gpte is changed from non-present to present.
- * Otherwise, the guest may use the wrong mapping.
- *
- * For PG_LEVEL_4K, kvm_mmu_get_page() has already
- * synchronized it transiently via kvm_sync_page().
- *
- * For higher level pagetable, we synchronize it via
- * the slower mmu_sync_children(). If it needs to
- * break, some progress has been made; return
- * RET_PF_RETRY and retry on the next #PF.
- * KVM_REQ_MMU_SYNC is not necessary but it
- * expedites the process.
- */
- if (sp->unsync_children &&
- mmu_sync_children(vcpu, sp, false))
- return RET_PF_RETRY;
- }
+ /*
+ * Synchronize the new page before linking it, as the CPU (KVM)
+ * is architecturally disallowed from inserting non-present
+ * entries into the TLB, i.e. the guest isn't required to flush
+ * the TLB when changing the gPTE from non-present to present.
+ *
+ * For PG_LEVEL_4K, kvm_mmu_find_shadow_page() has already
+ * synchronized the page via kvm_sync_page().
+ *
+ * For higher level pages, which cannot be unsync themselves
+ * but can have unsync children, synchronize via the slower
+ * mmu_sync_children(). If KVM needs to drop mmu_lock due to
+ * contention or to reschedule, instruct the caller to retry
+ * the #PF (mmu_sync_children() ensures forward progress will
+ * be made).
+ */
+ if (sp != ERR_PTR(-EEXIST) && sp->unsync_children &&
+ mmu_sync_children(vcpu, sp, false))
+ return RET_PF_RETRY;
/*
- * Verify that the gpte in the page we've just write
- * protected is still there.
+ * Verify that the gpte in the page, which is now either
+ * write-protected or unsync, wasn't modified between the fault
+ * and acquiring mmu_lock. This needs to be done even when
+ * reusing an existing shadow page to ensure the information
+ * gathered by the walker matches the information stored in the
+ * shadow page (which could have been modified by a different
+ * vCPU even if the page was already linked). Holding mmu_lock
+ * prevents the shadow page from changing after this point.
*/
if (FNAME(gpte_changed)(vcpu, gw, it.level - 1))
- goto out_gpte_changed;
+ return RET_PF_RETRY;
if (sp != ERR_PTR(-EEXIST))
link_shadow_page(vcpu, it.sptep, sp);
@@ -755,9 +759,6 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
FNAME(pte_prefetch)(vcpu, gw, it.sptep);
return ret;
-
-out_gpte_changed:
- return RET_PF_RETRY;
}
/*
@@ -805,7 +806,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
if (page_fault_handle_page_track(vcpu, fault)) {
shadow_page_table_clear_flood(vcpu, fault->addr);
- return RET_PF_EMULATE;
+ return RET_PF_WRITE_PROTECTED;
}
r = mmu_topup_memory_caches(vcpu, true);
@@ -911,7 +912,8 @@ static int FNAME(sync_spte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int
gpa_t pte_gpa;
gfn_t gfn;
- if (WARN_ON_ONCE(!sp->spt[i]))
+ if (WARN_ON_ONCE(sp->spt[i] == SHADOW_NONPRESENT_VALUE ||
+ !sp->shadowed_translation))
return 0;
first_pte_gpa = FNAME(get_level1_sp_gpa)(sp);
@@ -933,13 +935,13 @@ static int FNAME(sync_spte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int
return 0;
/*
- * Drop the SPTE if the new protections would result in a RWX=0
- * SPTE or if the gfn is changing. The RWX=0 case only affects
- * EPT with execute-only support, i.e. EPT without an effective
- * "present" bit, as all other paging modes will create a
- * read-only SPTE if pte_access is zero.
+ * Drop the SPTE if the new protections result in no effective
+ * "present" bit or if the gfn is changing. The former case
+ * only affects EPT with execute-only support with pte_access==0;
+ * all other paging modes will create a read-only SPTE if
+ * pte_access is zero.
*/
- if ((!pte_access && !shadow_present_mask) ||
+ if ((pte_access | shadow_present_mask) == SHADOW_NONPRESENT_VALUE ||
gfn != kvm_mmu_page_get_gfn(sp, i)) {
drop_spte(vcpu->kvm, &sp->spt[i]);
return 1;
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index 4a599130e9c9..8f7eb3ad88fc 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -43,7 +43,25 @@ u64 __read_mostly shadow_acc_track_mask;
u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;
-u8 __read_mostly shadow_phys_bits;
+static u8 __init kvm_get_host_maxphyaddr(void)
+{
+ /*
+ * boot_cpu_data.x86_phys_bits is reduced when MKTME or SME are detected
+ * in CPU detection code, but the processor treats those reduced bits as
+ * 'keyID' thus they are not reserved bits. Therefore KVM needs to look at
+ * the physical address bits reported by CPUID, i.e. the raw MAXPHYADDR,
+ * when reasoning about CPU behavior with respect to MAXPHYADDR.
+ */
+ if (likely(boot_cpu_data.extended_cpuid_level >= 0x80000008))
+ return cpuid_eax(0x80000008) & 0xff;
+
+ /*
+ * Quite weird to have VMX or SVM but not MAXPHYADDR; probably a VM with
+ * custom CPUID. Proceed with whatever the kernel found since these features
+ * aren't virtualizable (SME/SEV also require CPUIDs higher than 0x80000008).
+ */
+ return boot_cpu_data.x86_phys_bits;
+}
void __init kvm_mmu_spte_module_init(void)
{
@@ -55,6 +73,8 @@ void __init kvm_mmu_spte_module_init(void)
* will change when the vendor module is (re)loaded.
*/
allow_mmio_caching = enable_mmio_caching;
+
+ kvm_host.maxphyaddr = kvm_get_host_maxphyaddr();
}
static u64 generation_mmio_spte_mask(u64 gen)
@@ -74,10 +94,10 @@ u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access)
u64 spte = generation_mmio_spte_mask(gen);
u64 gpa = gfn << PAGE_SHIFT;
- WARN_ON_ONCE(!shadow_mmio_value);
+ WARN_ON_ONCE(!vcpu->kvm->arch.shadow_mmio_value);
access &= shadow_mmio_access_mask;
- spte |= shadow_mmio_value | access;
+ spte |= vcpu->kvm->arch.shadow_mmio_value | access;
spte |= gpa | shadow_nonpresent_or_rsvd_mask;
spte |= (gpa & shadow_nonpresent_or_rsvd_mask)
<< SHADOW_NONPRESENT_OR_RSVD_MASK_LEN;
@@ -144,19 +164,19 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
u64 spte = SPTE_MMU_PRESENT_MASK;
bool wrprot = false;
- WARN_ON_ONCE(!pte_access && !shadow_present_mask);
+ /*
+ * For the EPT case, shadow_present_mask has no RWX bits set if
+ * exec-only page table entries are supported. In that case,
+ * ACC_USER_MASK and shadow_user_mask are used to represent
+ * read access. See FNAME(gpte_access) in paging_tmpl.h.
+ */
+ WARN_ON_ONCE((pte_access | shadow_present_mask) == SHADOW_NONPRESENT_VALUE);
if (sp->role.ad_disabled)
spte |= SPTE_TDP_AD_DISABLED;
else if (kvm_mmu_page_ad_need_write_protect(sp))
spte |= SPTE_TDP_AD_WRPROT_ONLY;
- /*
- * For the EPT case, shadow_present_mask is 0 if hardware
- * supports exec-only page table entries. In that case,
- * ACC_USER_MASK and shadow_user_mask are used to represent
- * read access. See FNAME(gpte_access) in paging_tmpl.h.
- */
spte |= shadow_present_mask;
if (!prefetch)
spte |= spte_shadow_accessed_mask(spte);
@@ -190,8 +210,8 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
spte |= PT_PAGE_SIZE_MASK;
if (shadow_memtype_mask)
- spte |= static_call(kvm_x86_get_mt_mask)(vcpu, gfn,
- kvm_is_mmio_pfn(pfn));
+ spte |= kvm_x86_call(get_mt_mask)(vcpu, gfn,
+ kvm_is_mmio_pfn(pfn));
if (host_writable)
spte |= shadow_host_writable_mask;
else
@@ -271,18 +291,12 @@ static u64 make_spte_executable(u64 spte)
* This is used during huge page splitting to build the SPTEs that make up the
* new page table.
*/
-u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte, union kvm_mmu_page_role role,
- int index)
+u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte,
+ union kvm_mmu_page_role role, int index)
{
- u64 child_spte;
-
- if (WARN_ON_ONCE(!is_shadow_present_pte(huge_spte)))
- return 0;
+ u64 child_spte = huge_spte;
- if (WARN_ON_ONCE(!is_large_pte(huge_spte)))
- return 0;
-
- child_spte = huge_spte;
+ KVM_BUG_ON(!is_shadow_present_pte(huge_spte) || !is_large_pte(huge_spte), kvm);
/*
* The child_spte already has the base address of the huge page being
@@ -322,22 +336,6 @@ u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled)
return spte;
}
-u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn)
-{
- u64 new_spte;
-
- new_spte = old_spte & ~SPTE_BASE_ADDR_MASK;
- new_spte |= (u64)new_pfn << PAGE_SHIFT;
-
- new_spte &= ~PT_WRITABLE_MASK;
- new_spte &= ~shadow_host_writable_mask;
- new_spte &= ~shadow_mmu_writable_mask;
-
- new_spte = mark_spte_for_access_track(new_spte);
-
- return new_spte;
-}
-
u64 mark_spte_for_access_track(u64 spte)
{
if (spte_ad_enabled(spte))
@@ -393,13 +391,13 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask)
mmio_value = 0;
/*
- * The masked MMIO value must obviously match itself and a removed SPTE
- * must not get a false positive. Removed SPTEs and MMIO SPTEs should
- * never collide as MMIO must set some RWX bits, and removed SPTEs must
+ * The masked MMIO value must obviously match itself and a frozen SPTE
+ * must not get a false positive. Frozen SPTEs and MMIO SPTEs should
+ * never collide as MMIO must set some RWX bits, and frozen SPTEs must
* not set any RWX bits.
*/
if (WARN_ON((mmio_value & mmio_mask) != mmio_value) ||
- WARN_ON(mmio_value && (REMOVED_SPTE & mmio_mask) == mmio_value))
+ WARN_ON(mmio_value && (FROZEN_SPTE & mmio_mask) == mmio_value))
mmio_value = 0;
if (!mmio_value)
@@ -429,7 +427,9 @@ void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only)
shadow_dirty_mask = has_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull;
shadow_nx_mask = 0ull;
shadow_x_mask = VMX_EPT_EXECUTABLE_MASK;
- shadow_present_mask = has_exec_only ? 0ull : VMX_EPT_READABLE_MASK;
+ /* VMX_EPT_SUPPRESS_VE_BIT is needed for W or X violation. */
+ shadow_present_mask =
+ (has_exec_only ? 0ull : VMX_EPT_READABLE_MASK) | VMX_EPT_SUPPRESS_VE_BIT;
/*
* EPT overrides the host MTRRs, and so KVM must program the desired
* memtype directly into the SPTEs. Note, this mask is just the mask
@@ -446,7 +446,7 @@ void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only)
* of an EPT paging-structure entry is 110b (write/execute).
*/
kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE,
- VMX_EPT_RWX_MASK, 0);
+ VMX_EPT_RWX_MASK | VMX_EPT_SUPPRESS_VE_BIT, 0);
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_ept_masks);
@@ -455,8 +455,6 @@ void kvm_mmu_reset_all_pte_masks(void)
u8 low_phys_bits;
u64 mask;
- shadow_phys_bits = kvm_get_shadow_phys_bits();
-
/*
* If the CPU has 46 or less physical address bits, then set an
* appropriate mask to guard against L1TF attacks. Otherwise, it is
@@ -508,7 +506,7 @@ void kvm_mmu_reset_all_pte_masks(void)
* 52-bit physical addresses then there are no reserved PA bits in the
* PTEs and so the reserved PA approach must be disabled.
*/
- if (shadow_phys_bits < 52)
+ if (kvm_host.maxphyaddr < 52)
mask = BIT_ULL(51) | PT_PRESENT_MASK;
else
mask = 0;
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index a129951c9a88..2cb816ea2430 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -3,6 +3,8 @@
#ifndef KVM_X86_MMU_SPTE_H
#define KVM_X86_MMU_SPTE_H
+#include <asm/vmx.h>
+
#include "mmu.h"
#include "mmu_internal.h"
@@ -149,6 +151,22 @@ static_assert(MMIO_SPTE_GEN_LOW_BITS == 8 && MMIO_SPTE_GEN_HIGH_BITS == 11);
#define MMIO_SPTE_GEN_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_BITS + MMIO_SPTE_GEN_HIGH_BITS - 1, 0)
+/*
+ * Non-present SPTE value needs to set bit 63 for TDX, in order to suppress
+ * #VE and get EPT violations on non-present PTEs. We can use the
+ * same value also without TDX for both VMX and SVM:
+ *
+ * For SVM NPT, for non-present spte (bit 0 = 0), other bits are ignored.
+ * For VMX EPT, bit 63 is ignored if #VE is disabled. (EPT_VIOLATION_VE=0)
+ * bit 63 is #VE suppress if #VE is enabled. (EPT_VIOLATION_VE=1)
+ */
+#ifdef CONFIG_X86_64
+#define SHADOW_NONPRESENT_VALUE BIT_ULL(63)
+static_assert(!(SHADOW_NONPRESENT_VALUE & SPTE_MMU_PRESENT_MASK));
+#else
+#define SHADOW_NONPRESENT_VALUE 0ULL
+#endif
+
extern u64 __read_mostly shadow_host_writable_mask;
extern u64 __read_mostly shadow_mmu_writable_mask;
extern u64 __read_mostly shadow_nx_mask;
@@ -184,24 +202,24 @@ extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
/*
* If a thread running without exclusive control of the MMU lock must perform a
- * multi-part operation on an SPTE, it can set the SPTE to REMOVED_SPTE as a
+ * multi-part operation on an SPTE, it can set the SPTE to FROZEN_SPTE as a
* non-present intermediate value. Other threads which encounter this value
* should not modify the SPTE.
*
* Use a semi-arbitrary value that doesn't set RWX bits, i.e. is not-present on
* both AMD and Intel CPUs, and doesn't set PFN bits, i.e. doesn't create a L1TF
- * vulnerability. Use only low bits to avoid 64-bit immediates.
+ * vulnerability.
*
* Only used by the TDP MMU.
*/
-#define REMOVED_SPTE 0x5a0ULL
+#define FROZEN_SPTE (SHADOW_NONPRESENT_VALUE | 0x5a0ULL)
-/* Removed SPTEs must not be misconstrued as shadow present PTEs. */
-static_assert(!(REMOVED_SPTE & SPTE_MMU_PRESENT_MASK));
+/* Frozen SPTEs must not be misconstrued as shadow present PTEs. */
+static_assert(!(FROZEN_SPTE & SPTE_MMU_PRESENT_MASK));
-static inline bool is_removed_spte(u64 spte)
+static inline bool is_frozen_spte(u64 spte)
{
- return spte == REMOVED_SPTE;
+ return spte == FROZEN_SPTE;
}
/* Get an SPTE's index into its parent's page table (and the spt array). */
@@ -249,9 +267,9 @@ static inline struct kvm_mmu_page *root_to_sp(hpa_t root)
return spte_to_child_sp(root);
}
-static inline bool is_mmio_spte(u64 spte)
+static inline bool is_mmio_spte(struct kvm *kvm, u64 spte)
{
- return (spte & shadow_mmio_mask) == shadow_mmio_value &&
+ return (spte & shadow_mmio_mask) == kvm->arch.shadow_mmio_value &&
likely(enable_mmio_caching);
}
@@ -260,6 +278,13 @@ static inline bool is_shadow_present_pte(u64 pte)
return !!(pte & SPTE_MMU_PRESENT_MASK);
}
+static inline bool is_ept_ve_possible(u64 spte)
+{
+ return (shadow_present_mask & VMX_EPT_SUPPRESS_VE_BIT) &&
+ !(spte & VMX_EPT_SUPPRESS_VE_BIT) &&
+ (spte & VMX_EPT_RWX_MASK) != VMX_EPT_MISCONFIG_WX_VALUE;
+}
+
/*
* Returns true if A/D bits are supported in hardware and are enabled by KVM.
* When enabled, KVM uses A/D bits for all non-nested MMUs. Because L1 can
@@ -496,8 +521,6 @@ static inline u64 restore_acc_track_spte(u64 spte)
return spte;
}
-u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn);
-
void __init kvm_mmu_spte_module_init(void);
void kvm_mmu_reset_all_pte_masks(void);
diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h
index fae559559a80..2880fd392e0c 100644
--- a/arch/x86/kvm/mmu/tdp_iter.h
+++ b/arch/x86/kvm/mmu/tdp_iter.h
@@ -21,11 +21,13 @@ static inline u64 kvm_tdp_mmu_read_spte(tdp_ptep_t sptep)
static inline u64 kvm_tdp_mmu_write_spte_atomic(tdp_ptep_t sptep, u64 new_spte)
{
+ KVM_MMU_WARN_ON(is_ept_ve_possible(new_spte));
return xchg(rcu_dereference(sptep), new_spte);
}
static inline void __kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 new_spte)
{
+ KVM_MMU_WARN_ON(is_ept_ve_possible(new_spte));
WRITE_ONCE(*rcu_dereference(sptep), new_spte);
}
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 04c1f0957fea..3b996c1fdaab 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -359,14 +359,14 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
/*
* Set the SPTE to a nonpresent value that other
* threads will not overwrite. If the SPTE was
- * already marked as removed then another thread
+ * already marked as frozen then another thread
* handling a page fault could overwrite it, so
* set the SPTE until it is set from some other
- * value to the removed SPTE value.
+ * value to the frozen SPTE value.
*/
for (;;) {
- old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, REMOVED_SPTE);
- if (!is_removed_spte(old_spte))
+ old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, FROZEN_SPTE);
+ if (!is_frozen_spte(old_spte))
break;
cpu_relax();
}
@@ -397,11 +397,11 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
* No retry is needed in the atomic update path as the
* sole concern is dropping a Dirty bit, i.e. no other
* task can zap/remove the SPTE as mmu_lock is held for
- * write. Marking the SPTE as a removed SPTE is not
+ * write. Marking the SPTE as a frozen SPTE is not
* strictly necessary for the same reason, but using
- * the remove SPTE value keeps the shared/exclusive
+ * the frozen SPTE value keeps the shared/exclusive
* paths consistent and allows the handle_changed_spte()
- * call below to hardcode the new value to REMOVED_SPTE.
+ * call below to hardcode the new value to FROZEN_SPTE.
*
* Note, even though dropping a Dirty bit is the only
* scenario where a non-atomic update could result in a
@@ -413,10 +413,10 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
* it here.
*/
old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte,
- REMOVED_SPTE, level);
+ FROZEN_SPTE, level);
}
handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
- old_spte, REMOVED_SPTE, level, shared);
+ old_spte, FROZEN_SPTE, level, shared);
}
call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
@@ -490,19 +490,19 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
*/
if (!was_present && !is_present) {
/*
- * If this change does not involve a MMIO SPTE or removed SPTE,
+ * If this change does not involve a MMIO SPTE or frozen SPTE,
* it is unexpected. Log the change, though it should not
* impact the guest since both the former and current SPTEs
* are nonpresent.
*/
- if (WARN_ON_ONCE(!is_mmio_spte(old_spte) &&
- !is_mmio_spte(new_spte) &&
- !is_removed_spte(new_spte)))
+ if (WARN_ON_ONCE(!is_mmio_spte(kvm, old_spte) &&
+ !is_mmio_spte(kvm, new_spte) &&
+ !is_frozen_spte(new_spte)))
pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
"should not be replaced with another,\n"
"different nonpresent SPTE, unless one or both\n"
"are MMIO SPTEs, or the new SPTE is\n"
- "a temporary removed SPTE.\n"
+ "a temporary frozen SPTE.\n"
"as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
as_id, gfn, old_spte, new_spte, level);
return;
@@ -530,6 +530,32 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
kvm_set_pfn_accessed(spte_to_pfn(old_spte));
}
+static inline int __must_check __tdp_mmu_set_spte_atomic(struct tdp_iter *iter,
+ u64 new_spte)
+{
+ u64 *sptep = rcu_dereference(iter->sptep);
+
+ /*
+ * The caller is responsible for ensuring the old SPTE is not a FROZEN
+ * SPTE. KVM should never attempt to zap or manipulate a FROZEN SPTE,
+ * and pre-checking before inserting a new SPTE is advantageous as it
+ * avoids unnecessary work.
+ */
+ WARN_ON_ONCE(iter->yielded || is_frozen_spte(iter->old_spte));
+
+ /*
+ * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
+ * does not hold the mmu_lock. On failure, i.e. if a different logical
+ * CPU modified the SPTE, try_cmpxchg64() updates iter->old_spte with
+ * the current value, so the caller operates on fresh data, e.g. if it
+ * retries tdp_mmu_set_spte_atomic()
+ */
+ if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte))
+ return -EBUSY;
+
+ return 0;
+}
+
/*
* tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
* and handle the associated bookkeeping. Do not mark the page dirty
@@ -547,31 +573,17 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
* no side-effects other than setting iter->old_spte to the last
* known value of the spte.
*/
-static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
- struct tdp_iter *iter,
- u64 new_spte)
+static inline int __must_check tdp_mmu_set_spte_atomic(struct kvm *kvm,
+ struct tdp_iter *iter,
+ u64 new_spte)
{
- u64 *sptep = rcu_dereference(iter->sptep);
-
- /*
- * The caller is responsible for ensuring the old SPTE is not a REMOVED
- * SPTE. KVM should never attempt to zap or manipulate a REMOVED SPTE,
- * and pre-checking before inserting a new SPTE is advantageous as it
- * avoids unnecessary work.
- */
- WARN_ON_ONCE(iter->yielded || is_removed_spte(iter->old_spte));
+ int ret;
lockdep_assert_held_read(&kvm->mmu_lock);
- /*
- * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
- * does not hold the mmu_lock. On failure, i.e. if a different logical
- * CPU modified the SPTE, try_cmpxchg64() updates iter->old_spte with
- * the current value, so the caller operates on fresh data, e.g. if it
- * retries tdp_mmu_set_spte_atomic()
- */
- if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte))
- return -EBUSY;
+ ret = __tdp_mmu_set_spte_atomic(iter, new_spte);
+ if (ret)
+ return ret;
handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
new_spte, iter->level, true);
@@ -579,31 +591,43 @@ static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
return 0;
}
-static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
- struct tdp_iter *iter)
+static inline int __must_check tdp_mmu_zap_spte_atomic(struct kvm *kvm,
+ struct tdp_iter *iter)
{
int ret;
+ lockdep_assert_held_read(&kvm->mmu_lock);
+
/*
- * Freeze the SPTE by setting it to a special,
- * non-present value. This will stop other threads from
- * immediately installing a present entry in its place
- * before the TLBs are flushed.
+ * Freeze the SPTE by setting it to a special, non-present value. This
+ * will stop other threads from immediately installing a present entry
+ * in its place before the TLBs are flushed.
+ *
+ * Delay processing of the zapped SPTE until after TLBs are flushed and
+ * the FROZEN_SPTE is replaced (see below).
*/
- ret = tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE);
+ ret = __tdp_mmu_set_spte_atomic(iter, FROZEN_SPTE);
if (ret)
return ret;
kvm_flush_remote_tlbs_gfn(kvm, iter->gfn, iter->level);
/*
- * No other thread can overwrite the removed SPTE as they must either
+ * No other thread can overwrite the frozen SPTE as they must either
* wait on the MMU lock or use tdp_mmu_set_spte_atomic() which will not
- * overwrite the special removed SPTE value. No bookkeeping is needed
- * here since the SPTE is going from non-present to non-present. Use
- * the raw write helper to avoid an unnecessary check on volatile bits.
+ * overwrite the special frozen SPTE value. Use the raw write helper to
+ * avoid an unnecessary check on volatile bits.
*/
- __kvm_tdp_mmu_write_spte(iter->sptep, 0);
+ __kvm_tdp_mmu_write_spte(iter->sptep, SHADOW_NONPRESENT_VALUE);
+
+ /*
+ * Process the zapped SPTE after flushing TLBs, and after replacing
+ * FROZEN_SPTE with 0. This minimizes the amount of time vCPUs are
+ * blocked by the FROZEN_SPTE and reduces contention on the child
+ * SPTEs.
+ */
+ handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
+ SHADOW_NONPRESENT_VALUE, iter->level, true);
return 0;
}
@@ -629,12 +653,12 @@ static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
/*
* No thread should be using this function to set SPTEs to or from the
- * temporary removed SPTE value.
+ * temporary frozen SPTE value.
* If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
* should be used. If operating under the MMU lock in write mode, the
- * use of the removed SPTE should not be necessary.
+ * use of the frozen SPTE should not be necessary.
*/
- WARN_ON_ONCE(is_removed_spte(old_spte) || is_removed_spte(new_spte));
+ WARN_ON_ONCE(is_frozen_spte(old_spte) || is_frozen_spte(new_spte));
old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level);
@@ -740,8 +764,8 @@ retry:
continue;
if (!shared)
- tdp_mmu_iter_set_spte(kvm, &iter, 0);
- else if (tdp_mmu_set_spte_atomic(kvm, &iter, 0))
+ tdp_mmu_iter_set_spte(kvm, &iter, SHADOW_NONPRESENT_VALUE);
+ else if (tdp_mmu_set_spte_atomic(kvm, &iter, SHADOW_NONPRESENT_VALUE))
goto retry;
}
}
@@ -808,8 +832,8 @@ bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte)))
return false;
- tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0,
- sp->gfn, sp->role.level + 1);
+ tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte,
+ SHADOW_NONPRESENT_VALUE, sp->gfn, sp->role.level + 1);
return true;
}
@@ -843,7 +867,7 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
!is_last_spte(iter.old_spte, iter.level))
continue;
- tdp_mmu_iter_set_spte(kvm, &iter, 0);
+ tdp_mmu_iter_set_spte(kvm, &iter, SHADOW_NONPRESENT_VALUE);
/*
* Zappings SPTEs in invalid roots doesn't require a TLB flush,
@@ -1022,13 +1046,11 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
* protected, emulation is needed. If the emulation was skipped,
* the vCPU would have the same fault again.
*/
- if (wrprot) {
- if (fault->write)
- ret = RET_PF_EMULATE;
- }
+ if (wrprot && fault->write)
+ ret = RET_PF_WRITE_PROTECTED;
/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
- if (unlikely(is_mmio_spte(new_spte))) {
+ if (unlikely(is_mmio_spte(vcpu->kvm, new_spte))) {
vcpu->stat.pf_mmio_spte_created++;
trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
new_spte);
@@ -1103,7 +1125,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
* If SPTE has been frozen by another thread, just give up and
* retry, avoiding unnecessary page table allocation and free.
*/
- if (is_removed_spte(iter.old_spte))
+ if (is_frozen_spte(iter.old_spte))
goto retry;
if (iter.level == fault->goal_level)
@@ -1258,52 +1280,6 @@ bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
}
-static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
- struct kvm_gfn_range *range)
-{
- u64 new_spte;
-
- /* Huge pages aren't expected to be modified without first being zapped. */
- WARN_ON_ONCE(pte_huge(range->arg.pte) || range->start + 1 != range->end);
-
- if (iter->level != PG_LEVEL_4K ||
- !is_shadow_present_pte(iter->old_spte))
- return false;
-
- /*
- * Note, when changing a read-only SPTE, it's not strictly necessary to
- * zero the SPTE before setting the new PFN, but doing so preserves the
- * invariant that the PFN of a present * leaf SPTE can never change.
- * See handle_changed_spte().
- */
- tdp_mmu_iter_set_spte(kvm, iter, 0);
-
- if (!pte_write(range->arg.pte)) {
- new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
- pte_pfn(range->arg.pte));
-
- tdp_mmu_iter_set_spte(kvm, iter, new_spte);
- }
-
- return true;
-}
-
-/*
- * Handle the changed_pte MMU notifier for the TDP MMU.
- * data is a pointer to the new pte_t mapping the HVA specified by the MMU
- * notifier.
- * Returns non-zero if a flush is needed before releasing the MMU lock.
- */
-bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
-{
- /*
- * No need to handle the remote TLB flush under RCU protection, the
- * target SPTE _must_ be a leaf SPTE, i.e. cannot result in freeing a
- * shadow page. See the WARN on pfn_changed in handle_changed_spte().
- */
- return kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
-}
-
/*
* Remove write access from all SPTEs at or above min_level that map GFNs
* [start, end). Returns true if an SPTE has been changed and the TLBs need to
@@ -1362,17 +1338,15 @@ bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
return spte_set;
}
-static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
+static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(void)
{
struct kvm_mmu_page *sp;
- gfp |= __GFP_ZERO;
-
- sp = kmem_cache_alloc(mmu_page_header_cache, gfp);
+ sp = kmem_cache_zalloc(mmu_page_header_cache, GFP_KERNEL_ACCOUNT);
if (!sp)
return NULL;
- sp->spt = (void *)__get_free_page(gfp);
+ sp->spt = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
if (!sp->spt) {
kmem_cache_free(mmu_page_header_cache, sp);
return NULL;
@@ -1381,47 +1355,6 @@ static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
return sp;
}
-static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
- struct tdp_iter *iter,
- bool shared)
-{
- struct kvm_mmu_page *sp;
-
- kvm_lockdep_assert_mmu_lock_held(kvm, shared);
-
- /*
- * Since we are allocating while under the MMU lock we have to be
- * careful about GFP flags. Use GFP_NOWAIT to avoid blocking on direct
- * reclaim and to avoid making any filesystem callbacks (which can end
- * up invoking KVM MMU notifiers, resulting in a deadlock).
- *
- * If this allocation fails we drop the lock and retry with reclaim
- * allowed.
- */
- sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT);
- if (sp)
- return sp;
-
- rcu_read_unlock();
-
- if (shared)
- read_unlock(&kvm->mmu_lock);
- else
- write_unlock(&kvm->mmu_lock);
-
- iter->yielded = true;
- sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT);
-
- if (shared)
- read_lock(&kvm->mmu_lock);
- else
- write_lock(&kvm->mmu_lock);
-
- rcu_read_lock();
-
- return sp;
-}
-
/* Note, the caller is responsible for initializing @sp. */
static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
struct kvm_mmu_page *sp, bool shared)
@@ -1468,7 +1401,6 @@ static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
{
struct kvm_mmu_page *sp = NULL;
struct tdp_iter iter;
- int ret = 0;
rcu_read_lock();
@@ -1492,17 +1424,31 @@ retry:
continue;
if (!sp) {
- sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared);
+ rcu_read_unlock();
+
+ if (shared)
+ read_unlock(&kvm->mmu_lock);
+ else
+ write_unlock(&kvm->mmu_lock);
+
+ sp = tdp_mmu_alloc_sp_for_split();
+
+ if (shared)
+ read_lock(&kvm->mmu_lock);
+ else
+ write_lock(&kvm->mmu_lock);
+
if (!sp) {
- ret = -ENOMEM;
trace_kvm_mmu_split_huge_page(iter.gfn,
iter.old_spte,
- iter.level, ret);
- break;
+ iter.level, -ENOMEM);
+ return -ENOMEM;
}
- if (iter.yielded)
- continue;
+ rcu_read_lock();
+
+ iter.yielded = true;
+ continue;
}
tdp_mmu_init_child_sp(sp, &iter);
@@ -1523,7 +1469,7 @@ retry:
if (sp)
tdp_mmu_free_sp(sp);
- return ret;
+ return 0;
}
@@ -1824,12 +1770,11 @@ int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
*
* WARNING: This function is only intended to be called during fast_page_fault.
*/
-u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
+u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gfn_t gfn,
u64 *spte)
{
struct tdp_iter iter;
struct kvm_mmu *mmu = vcpu->arch.mmu;
- gfn_t gfn = addr >> PAGE_SHIFT;
tdp_ptep_t sptep = NULL;
tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index 6e1ea04ca885..1b74e058a81c 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -31,7 +31,6 @@ bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
bool flush);
bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range);
bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
-bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
const struct kvm_memory_slot *slot, int min_level);
@@ -65,7 +64,7 @@ static inline void kvm_tdp_mmu_walk_lockless_end(void)
int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
int *root_level);
-u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
+u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gfn_t gfn,
u64 *spte);
#ifdef CONFIG_X86_64
diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c
index a67c28a56417..05490b9d8a43 100644
--- a/arch/x86/kvm/mtrr.c
+++ b/arch/x86/kvm/mtrr.c
@@ -19,33 +19,21 @@
#include <asm/mtrr.h>
#include "cpuid.h"
-#include "mmu.h"
-#define IA32_MTRR_DEF_TYPE_E (1ULL << 11)
-#define IA32_MTRR_DEF_TYPE_FE (1ULL << 10)
-#define IA32_MTRR_DEF_TYPE_TYPE_MASK (0xff)
-
-static bool is_mtrr_base_msr(unsigned int msr)
-{
- /* MTRR base MSRs use even numbers, masks use odd numbers. */
- return !(msr & 0x1);
-}
-
-static struct kvm_mtrr_range *var_mtrr_msr_to_range(struct kvm_vcpu *vcpu,
- unsigned int msr)
+static u64 *find_mtrr(struct kvm_vcpu *vcpu, unsigned int msr)
{
- int index = (msr - MTRRphysBase_MSR(0)) / 2;
-
- return &vcpu->arch.mtrr_state.var_ranges[index];
-}
+ int index;
-static bool msr_mtrr_valid(unsigned msr)
-{
switch (msr) {
case MTRRphysBase_MSR(0) ... MTRRphysMask_MSR(KVM_NR_VAR_MTRR - 1):
+ index = msr - MTRRphysBase_MSR(0);
+ return &vcpu->arch.mtrr_state.var[index];
case MSR_MTRRfix64K_00000:
+ return &vcpu->arch.mtrr_state.fixed_64k;
case MSR_MTRRfix16K_80000:
case MSR_MTRRfix16K_A0000:
+ index = msr - MSR_MTRRfix16K_80000;
+ return &vcpu->arch.mtrr_state.fixed_16k[index];
case MSR_MTRRfix4K_C0000:
case MSR_MTRRfix4K_C8000:
case MSR_MTRRfix4K_D0000:
@@ -54,10 +42,14 @@ static bool msr_mtrr_valid(unsigned msr)
case MSR_MTRRfix4K_E8000:
case MSR_MTRRfix4K_F0000:
case MSR_MTRRfix4K_F8000:
+ index = msr - MSR_MTRRfix4K_C0000;
+ return &vcpu->arch.mtrr_state.fixed_4k[index];
case MSR_MTRRdefType:
- return true;
+ return &vcpu->arch.mtrr_state.deftype;
+ default:
+ break;
}
- return false;
+ return NULL;
}
static bool valid_mtrr_type(unsigned t)
@@ -70,9 +62,6 @@ static bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
int i;
u64 mask;
- if (!msr_mtrr_valid(msr))
- return false;
-
if (msr == MSR_MTRRdefType) {
if (data & ~0xcff)
return false;
@@ -85,8 +74,9 @@ static bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
}
/* variable MTRRs */
- WARN_ON(!(msr >= MTRRphysBase_MSR(0) &&
- msr <= MTRRphysMask_MSR(KVM_NR_VAR_MTRR - 1)));
+ if (WARN_ON_ONCE(!(msr >= MTRRphysBase_MSR(0) &&
+ msr <= MTRRphysMask_MSR(KVM_NR_VAR_MTRR - 1))))
+ return false;
mask = kvm_vcpu_reserved_gpa_bits_raw(vcpu);
if ((msr & 1) == 0) {
@@ -94,309 +84,32 @@ static bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
if (!valid_mtrr_type(data & 0xff))
return false;
mask |= 0xf00;
- } else
+ } else {
/* MTRR mask */
mask |= 0x7ff;
-
- return (data & mask) == 0;
-}
-
-static bool mtrr_is_enabled(struct kvm_mtrr *mtrr_state)
-{
- return !!(mtrr_state->deftype & IA32_MTRR_DEF_TYPE_E);
-}
-
-static bool fixed_mtrr_is_enabled(struct kvm_mtrr *mtrr_state)
-{
- return !!(mtrr_state->deftype & IA32_MTRR_DEF_TYPE_FE);
-}
-
-static u8 mtrr_default_type(struct kvm_mtrr *mtrr_state)
-{
- return mtrr_state->deftype & IA32_MTRR_DEF_TYPE_TYPE_MASK;
-}
-
-static u8 mtrr_disabled_type(struct kvm_vcpu *vcpu)
-{
- /*
- * Intel SDM 11.11.2.2: all MTRRs are disabled when
- * IA32_MTRR_DEF_TYPE.E bit is cleared, and the UC
- * memory type is applied to all of physical memory.
- *
- * However, virtual machines can be run with CPUID such that
- * there are no MTRRs. In that case, the firmware will never
- * enable MTRRs and it is obviously undesirable to run the
- * guest entirely with UC memory and we use WB.
- */
- if (guest_cpuid_has(vcpu, X86_FEATURE_MTRR))
- return MTRR_TYPE_UNCACHABLE;
- else
- return MTRR_TYPE_WRBACK;
-}
-
-/*
-* Three terms are used in the following code:
-* - segment, it indicates the address segments covered by fixed MTRRs.
-* - unit, it corresponds to the MSR entry in the segment.
-* - range, a range is covered in one memory cache type.
-*/
-struct fixed_mtrr_segment {
- u64 start;
- u64 end;
-
- int range_shift;
-
- /* the start position in kvm_mtrr.fixed_ranges[]. */
- int range_start;
-};
-
-static struct fixed_mtrr_segment fixed_seg_table[] = {
- /* MSR_MTRRfix64K_00000, 1 unit. 64K fixed mtrr. */
- {
- .start = 0x0,
- .end = 0x80000,
- .range_shift = 16, /* 64K */
- .range_start = 0,
- },
-
- /*
- * MSR_MTRRfix16K_80000 ... MSR_MTRRfix16K_A0000, 2 units,
- * 16K fixed mtrr.
- */
- {
- .start = 0x80000,
- .end = 0xc0000,
- .range_shift = 14, /* 16K */
- .range_start = 8,
- },
-
- /*
- * MSR_MTRRfix4K_C0000 ... MSR_MTRRfix4K_F8000, 8 units,
- * 4K fixed mtrr.
- */
- {
- .start = 0xc0000,
- .end = 0x100000,
- .range_shift = 12, /* 12K */
- .range_start = 24,
- }
-};
-
-/*
- * The size of unit is covered in one MSR, one MSR entry contains
- * 8 ranges so that unit size is always 8 * 2^range_shift.
- */
-static u64 fixed_mtrr_seg_unit_size(int seg)
-{
- return 8 << fixed_seg_table[seg].range_shift;
-}
-
-static bool fixed_msr_to_seg_unit(u32 msr, int *seg, int *unit)
-{
- switch (msr) {
- case MSR_MTRRfix64K_00000:
- *seg = 0;
- *unit = 0;
- break;
- case MSR_MTRRfix16K_80000 ... MSR_MTRRfix16K_A0000:
- *seg = 1;
- *unit = array_index_nospec(
- msr - MSR_MTRRfix16K_80000,
- MSR_MTRRfix16K_A0000 - MSR_MTRRfix16K_80000 + 1);
- break;
- case MSR_MTRRfix4K_C0000 ... MSR_MTRRfix4K_F8000:
- *seg = 2;
- *unit = array_index_nospec(
- msr - MSR_MTRRfix4K_C0000,
- MSR_MTRRfix4K_F8000 - MSR_MTRRfix4K_C0000 + 1);
- break;
- default:
- return false;
}
- return true;
-}
-
-static void fixed_mtrr_seg_unit_range(int seg, int unit, u64 *start, u64 *end)
-{
- struct fixed_mtrr_segment *mtrr_seg = &fixed_seg_table[seg];
- u64 unit_size = fixed_mtrr_seg_unit_size(seg);
-
- *start = mtrr_seg->start + unit * unit_size;
- *end = *start + unit_size;
- WARN_ON(*end > mtrr_seg->end);
-}
-
-static int fixed_mtrr_seg_unit_range_index(int seg, int unit)
-{
- struct fixed_mtrr_segment *mtrr_seg = &fixed_seg_table[seg];
-
- WARN_ON(mtrr_seg->start + unit * fixed_mtrr_seg_unit_size(seg)
- > mtrr_seg->end);
-
- /* each unit has 8 ranges. */
- return mtrr_seg->range_start + 8 * unit;
-}
-
-static int fixed_mtrr_seg_end_range_index(int seg)
-{
- struct fixed_mtrr_segment *mtrr_seg = &fixed_seg_table[seg];
- int n;
-
- n = (mtrr_seg->end - mtrr_seg->start) >> mtrr_seg->range_shift;
- return mtrr_seg->range_start + n - 1;
-}
-
-static bool fixed_msr_to_range(u32 msr, u64 *start, u64 *end)
-{
- int seg, unit;
-
- if (!fixed_msr_to_seg_unit(msr, &seg, &unit))
- return false;
-
- fixed_mtrr_seg_unit_range(seg, unit, start, end);
- return true;
-}
-
-static int fixed_msr_to_range_index(u32 msr)
-{
- int seg, unit;
-
- if (!fixed_msr_to_seg_unit(msr, &seg, &unit))
- return -1;
-
- return fixed_mtrr_seg_unit_range_index(seg, unit);
-}
-
-static int fixed_mtrr_addr_to_seg(u64 addr)
-{
- struct fixed_mtrr_segment *mtrr_seg;
- int seg, seg_num = ARRAY_SIZE(fixed_seg_table);
-
- for (seg = 0; seg < seg_num; seg++) {
- mtrr_seg = &fixed_seg_table[seg];
- if (mtrr_seg->start <= addr && addr < mtrr_seg->end)
- return seg;
- }
-
- return -1;
-}
-
-static int fixed_mtrr_addr_seg_to_range_index(u64 addr, int seg)
-{
- struct fixed_mtrr_segment *mtrr_seg;
- int index;
-
- mtrr_seg = &fixed_seg_table[seg];
- index = mtrr_seg->range_start;
- index += (addr - mtrr_seg->start) >> mtrr_seg->range_shift;
- return index;
-}
-
-static u64 fixed_mtrr_range_end_addr(int seg, int index)
-{
- struct fixed_mtrr_segment *mtrr_seg = &fixed_seg_table[seg];
- int pos = index - mtrr_seg->range_start;
-
- return mtrr_seg->start + ((pos + 1) << mtrr_seg->range_shift);
-}
-
-static void var_mtrr_range(struct kvm_mtrr_range *range, u64 *start, u64 *end)
-{
- u64 mask;
-
- *start = range->base & PAGE_MASK;
-
- mask = range->mask & PAGE_MASK;
-
- /* This cannot overflow because writing to the reserved bits of
- * variable MTRRs causes a #GP.
- */
- *end = (*start | ~mask) + 1;
-}
-
-static void update_mtrr(struct kvm_vcpu *vcpu, u32 msr)
-{
- struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state;
- gfn_t start, end;
-
- if (!kvm_mmu_honors_guest_mtrrs(vcpu->kvm))
- return;
-
- if (!mtrr_is_enabled(mtrr_state) && msr != MSR_MTRRdefType)
- return;
-
- /* fixed MTRRs. */
- if (fixed_msr_to_range(msr, &start, &end)) {
- if (!fixed_mtrr_is_enabled(mtrr_state))
- return;
- } else if (msr == MSR_MTRRdefType) {
- start = 0x0;
- end = ~0ULL;
- } else {
- /* variable range MTRRs. */
- var_mtrr_range(var_mtrr_msr_to_range(vcpu, msr), &start, &end);
- }
-
- kvm_zap_gfn_range(vcpu->kvm, gpa_to_gfn(start), gpa_to_gfn(end));
-}
-
-static bool var_mtrr_range_is_valid(struct kvm_mtrr_range *range)
-{
- return (range->mask & (1 << 11)) != 0;
-}
-
-static void set_var_mtrr_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
-{
- struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state;
- struct kvm_mtrr_range *tmp, *cur;
-
- cur = var_mtrr_msr_to_range(vcpu, msr);
-
- /* remove the entry if it's in the list. */
- if (var_mtrr_range_is_valid(cur))
- list_del(&cur->node);
-
- /*
- * Set all illegal GPA bits in the mask, since those bits must
- * implicitly be 0. The bits are then cleared when reading them.
- */
- if (is_mtrr_base_msr(msr))
- cur->base = data;
- else
- cur->mask = data | kvm_vcpu_reserved_gpa_bits_raw(vcpu);
-
- /* add it to the list if it's enabled. */
- if (var_mtrr_range_is_valid(cur)) {
- list_for_each_entry(tmp, &mtrr_state->head, node)
- if (cur->base >= tmp->base)
- break;
- list_add_tail(&cur->node, &tmp->node);
- }
+ return (data & mask) == 0;
}
int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
{
- int index;
+ u64 *mtrr;
- if (!kvm_mtrr_valid(vcpu, msr, data))
+ mtrr = find_mtrr(vcpu, msr);
+ if (!mtrr)
return 1;
- index = fixed_msr_to_range_index(msr);
- if (index >= 0)
- *(u64 *)&vcpu->arch.mtrr_state.fixed_ranges[index] = data;
- else if (msr == MSR_MTRRdefType)
- vcpu->arch.mtrr_state.deftype = data;
- else
- set_var_mtrr_msr(vcpu, msr, data);
+ if (!kvm_mtrr_valid(vcpu, msr, data))
+ return 1;
- update_mtrr(vcpu, msr);
+ *mtrr = data;
return 0;
}
int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
{
- int index;
+ u64 *mtrr;
/* MSR_MTRRcap is a readonly MSR. */
if (msr == MSR_MTRRcap) {
@@ -410,311 +123,10 @@ int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
return 0;
}
- if (!msr_mtrr_valid(msr))
+ mtrr = find_mtrr(vcpu, msr);
+ if (!mtrr)
return 1;
- index = fixed_msr_to_range_index(msr);
- if (index >= 0) {
- *pdata = *(u64 *)&vcpu->arch.mtrr_state.fixed_ranges[index];
- } else if (msr == MSR_MTRRdefType) {
- *pdata = vcpu->arch.mtrr_state.deftype;
- } else {
- /* Variable MTRRs */
- if (is_mtrr_base_msr(msr))
- *pdata = var_mtrr_msr_to_range(vcpu, msr)->base;
- else
- *pdata = var_mtrr_msr_to_range(vcpu, msr)->mask;
-
- *pdata &= ~kvm_vcpu_reserved_gpa_bits_raw(vcpu);
- }
-
+ *pdata = *mtrr;
return 0;
}
-
-void kvm_vcpu_mtrr_init(struct kvm_vcpu *vcpu)
-{
- INIT_LIST_HEAD(&vcpu->arch.mtrr_state.head);
-}
-
-struct mtrr_iter {
- /* input fields. */
- struct kvm_mtrr *mtrr_state;
- u64 start;
- u64 end;
-
- /* output fields. */
- int mem_type;
- /* mtrr is completely disabled? */
- bool mtrr_disabled;
- /* [start, end) is not fully covered in MTRRs? */
- bool partial_map;
-
- /* private fields. */
- union {
- /* used for fixed MTRRs. */
- struct {
- int index;
- int seg;
- };
-
- /* used for var MTRRs. */
- struct {
- struct kvm_mtrr_range *range;
- /* max address has been covered in var MTRRs. */
- u64 start_max;
- };
- };
-
- bool fixed;
-};
-
-static bool mtrr_lookup_fixed_start(struct mtrr_iter *iter)
-{
- int seg, index;
-
- if (!fixed_mtrr_is_enabled(iter->mtrr_state))
- return false;
-
- seg = fixed_mtrr_addr_to_seg(iter->start);
- if (seg < 0)
- return false;
-
- iter->fixed = true;
- index = fixed_mtrr_addr_seg_to_range_index(iter->start, seg);
- iter->index = index;
- iter->seg = seg;
- return true;
-}
-
-static bool match_var_range(struct mtrr_iter *iter,
- struct kvm_mtrr_range *range)
-{
- u64 start, end;
-
- var_mtrr_range(range, &start, &end);
- if (!(start >= iter->end || end <= iter->start)) {
- iter->range = range;
-
- /*
- * the function is called when we do kvm_mtrr.head walking.
- * Range has the minimum base address which interleaves
- * [looker->start_max, looker->end).
- */
- iter->partial_map |= iter->start_max < start;
-
- /* update the max address has been covered. */
- iter->start_max = max(iter->start_max, end);
- return true;
- }
-
- return false;
-}
-
-static void __mtrr_lookup_var_next(struct mtrr_iter *iter)
-{
- struct kvm_mtrr *mtrr_state = iter->mtrr_state;
-
- list_for_each_entry_continue(iter->range, &mtrr_state->head, node)
- if (match_var_range(iter, iter->range))
- return;
-
- iter->range = NULL;
- iter->partial_map |= iter->start_max < iter->end;
-}
-
-static void mtrr_lookup_var_start(struct mtrr_iter *iter)
-{
- struct kvm_mtrr *mtrr_state = iter->mtrr_state;
-
- iter->fixed = false;
- iter->start_max = iter->start;
- iter->range = NULL;
- iter->range = list_prepare_entry(iter->range, &mtrr_state->head, node);
-
- __mtrr_lookup_var_next(iter);
-}
-
-static void mtrr_lookup_fixed_next(struct mtrr_iter *iter)
-{
- /* terminate the lookup. */
- if (fixed_mtrr_range_end_addr(iter->seg, iter->index) >= iter->end) {
- iter->fixed = false;
- iter->range = NULL;
- return;
- }
-
- iter->index++;
-
- /* have looked up for all fixed MTRRs. */
- if (iter->index >= ARRAY_SIZE(iter->mtrr_state->fixed_ranges))
- return mtrr_lookup_var_start(iter);
-
- /* switch to next segment. */
- if (iter->index > fixed_mtrr_seg_end_range_index(iter->seg))
- iter->seg++;
-}
-
-static void mtrr_lookup_var_next(struct mtrr_iter *iter)
-{
- __mtrr_lookup_var_next(iter);
-}
-
-static void mtrr_lookup_start(struct mtrr_iter *iter)
-{
- if (!mtrr_is_enabled(iter->mtrr_state)) {
- iter->mtrr_disabled = true;
- return;
- }
-
- if (!mtrr_lookup_fixed_start(iter))
- mtrr_lookup_var_start(iter);
-}
-
-static void mtrr_lookup_init(struct mtrr_iter *iter,
- struct kvm_mtrr *mtrr_state, u64 start, u64 end)
-{
- iter->mtrr_state = mtrr_state;
- iter->start = start;
- iter->end = end;
- iter->mtrr_disabled = false;
- iter->partial_map = false;
- iter->fixed = false;
- iter->range = NULL;
-
- mtrr_lookup_start(iter);
-}
-
-static bool mtrr_lookup_okay(struct mtrr_iter *iter)
-{
- if (iter->fixed) {
- iter->mem_type = iter->mtrr_state->fixed_ranges[iter->index];
- return true;
- }
-
- if (iter->range) {
- iter->mem_type = iter->range->base & 0xff;
- return true;
- }
-
- return false;
-}
-
-static void mtrr_lookup_next(struct mtrr_iter *iter)
-{
- if (iter->fixed)
- mtrr_lookup_fixed_next(iter);
- else
- mtrr_lookup_var_next(iter);
-}
-
-#define mtrr_for_each_mem_type(_iter_, _mtrr_, _gpa_start_, _gpa_end_) \
- for (mtrr_lookup_init(_iter_, _mtrr_, _gpa_start_, _gpa_end_); \
- mtrr_lookup_okay(_iter_); mtrr_lookup_next(_iter_))
-
-u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
-{
- struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state;
- struct mtrr_iter iter;
- u64 start, end;
- int type = -1;
- const int wt_wb_mask = (1 << MTRR_TYPE_WRBACK)
- | (1 << MTRR_TYPE_WRTHROUGH);
-
- start = gfn_to_gpa(gfn);
- end = start + PAGE_SIZE;
-
- mtrr_for_each_mem_type(&iter, mtrr_state, start, end) {
- int curr_type = iter.mem_type;
-
- /*
- * Please refer to Intel SDM Volume 3: 11.11.4.1 MTRR
- * Precedences.
- */
-
- if (type == -1) {
- type = curr_type;
- continue;
- }
-
- /*
- * If two or more variable memory ranges match and the
- * memory types are identical, then that memory type is
- * used.
- */
- if (type == curr_type)
- continue;
-
- /*
- * If two or more variable memory ranges match and one of
- * the memory types is UC, the UC memory type used.
- */
- if (curr_type == MTRR_TYPE_UNCACHABLE)
- return MTRR_TYPE_UNCACHABLE;
-
- /*
- * If two or more variable memory ranges match and the
- * memory types are WT and WB, the WT memory type is used.
- */
- if (((1 << type) & wt_wb_mask) &&
- ((1 << curr_type) & wt_wb_mask)) {
- type = MTRR_TYPE_WRTHROUGH;
- continue;
- }
-
- /*
- * For overlaps not defined by the above rules, processor
- * behavior is undefined.
- */
-
- /* We use WB for this undefined behavior. :( */
- return MTRR_TYPE_WRBACK;
- }
-
- if (iter.mtrr_disabled)
- return mtrr_disabled_type(vcpu);
-
- /* not contained in any MTRRs. */
- if (type == -1)
- return mtrr_default_type(mtrr_state);
-
- /*
- * We just check one page, partially covered by MTRRs is
- * impossible.
- */
- WARN_ON(iter.partial_map);
-
- return type;
-}
-EXPORT_SYMBOL_GPL(kvm_mtrr_get_guest_memory_type);
-
-bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn,
- int page_num)
-{
- struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state;
- struct mtrr_iter iter;
- u64 start, end;
- int type = -1;
-
- start = gfn_to_gpa(gfn);
- end = gfn_to_gpa(gfn + page_num);
- mtrr_for_each_mem_type(&iter, mtrr_state, start, end) {
- if (type == -1) {
- type = iter.mem_type;
- continue;
- }
-
- if (type != iter.mem_type)
- return false;
- }
-
- if (iter.mtrr_disabled)
- return true;
-
- if (!iter.partial_map)
- return true;
-
- if (type == -1)
- return true;
-
- return type == mtrr_default_type(mtrr_state);
-}
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index a593b03c9aed..47a46283c866 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -34,16 +34,16 @@ EXPORT_SYMBOL_GPL(kvm_pmu_eventsel);
/* Precise Distribution of Instructions Retired (PDIR) */
static const struct x86_cpu_id vmx_pebs_pdir_cpu[] = {
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, NULL),
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, NULL),
+ X86_MATCH_VFM(INTEL_ICELAKE_D, NULL),
+ X86_MATCH_VFM(INTEL_ICELAKE_X, NULL),
/* Instruction-Accurate PDIR (PDIR++) */
- X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, NULL),
+ X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, NULL),
{}
};
/* Precise Distribution (PDist) */
static const struct x86_cpu_id vmx_pebs_pdist_cpu[] = {
- X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, NULL),
+ X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, NULL),
{}
};
@@ -69,7 +69,7 @@ static const struct x86_cpu_id vmx_pebs_pdist_cpu[] = {
* code. Each pmc, stored in kvm_pmc.idx field, is unique across
* all perf counters (both gp and fixed). The mapping relationship
* between pmc and perf counters is as the following:
- * * Intel: [0 .. KVM_INTEL_PMC_MAX_GENERIC-1] <=> gp counters
+ * * Intel: [0 .. KVM_MAX_NR_INTEL_GP_COUNTERS-1] <=> gp counters
* [KVM_FIXED_PMC_BASE_IDX .. KVM_FIXED_PMC_BASE_IDX + 2] <=> fixed
* * AMD: [0 .. AMD64_NUM_COUNTERS-1] and, for families 15H
* and later, [0 .. AMD64_NUM_COUNTERS_CORE-1] <=> gp counters
@@ -194,7 +194,7 @@ static int pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config,
attr.sample_period = get_sample_period(pmc, pmc->counter);
if ((attr.config & HSW_IN_TX_CHECKPOINTED) &&
- guest_cpuid_is_intel(pmc->vcpu)) {
+ (boot_cpu_has(X86_FEATURE_RTM) || boot_cpu_has(X86_FEATURE_HLE))) {
/*
* HSW_IN_TX_CHECKPOINTED is not supported with nonzero
* period. Just clear the sample period so at least
@@ -469,11 +469,11 @@ static int reprogram_counter(struct kvm_pmc *pmc)
if (pmc_is_fixed(pmc)) {
fixed_ctr_ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl,
pmc->idx - KVM_FIXED_PMC_BASE_IDX);
- if (fixed_ctr_ctrl & 0x1)
+ if (fixed_ctr_ctrl & INTEL_FIXED_0_KERNEL)
eventsel |= ARCH_PERFMON_EVENTSEL_OS;
- if (fixed_ctr_ctrl & 0x2)
+ if (fixed_ctr_ctrl & INTEL_FIXED_0_USER)
eventsel |= ARCH_PERFMON_EVENTSEL_USR;
- if (fixed_ctr_ctrl & 0x8)
+ if (fixed_ctr_ctrl & INTEL_FIXED_0_ENABLE_PMI)
eventsel |= ARCH_PERFMON_EVENTSEL_INT;
new_config = (u64)fixed_ctr_ctrl;
}
@@ -521,9 +521,9 @@ void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
}
/*
- * Unused perf_events are only released if the corresponding MSRs
- * weren't accessed during the last vCPU time slice. kvm_arch_sched_in
- * triggers KVM_REQ_PMU if cleanup is needed.
+ * Release unused perf_events if the corresponding guest MSRs weren't
+ * accessed during the last vCPU time slice (need_cleanup is set when
+ * the vCPU is scheduled back in).
*/
if (unlikely(pmu->need_cleanup))
kvm_pmu_cleanup(vcpu);
@@ -542,7 +542,7 @@ int kvm_pmu_check_rdpmc_early(struct kvm_vcpu *vcpu, unsigned int idx)
if (!kvm_pmu_ops.check_rdpmc_early)
return 0;
- return static_call(kvm_x86_pmu_check_rdpmc_early)(vcpu, idx);
+ return kvm_pmu_call(check_rdpmc_early)(vcpu, idx);
}
bool is_vmware_backdoor_pmc(u32 pmc_idx)
@@ -591,12 +591,12 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
if (is_vmware_backdoor_pmc(idx))
return kvm_pmu_rdpmc_vmware(vcpu, idx, data);
- pmc = static_call(kvm_x86_pmu_rdpmc_ecx_to_pmc)(vcpu, idx, &mask);
+ pmc = kvm_pmu_call(rdpmc_ecx_to_pmc)(vcpu, idx, &mask);
if (!pmc)
return 1;
if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_PCE) &&
- (static_call(kvm_x86_get_cpl)(vcpu) != 0) &&
+ (kvm_x86_call(get_cpl)(vcpu) != 0) &&
kvm_is_cr0_bit_set(vcpu, X86_CR0_PE))
return 1;
@@ -607,7 +607,7 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu)
{
if (lapic_in_kernel(vcpu)) {
- static_call_cond(kvm_x86_pmu_deliver_pmi)(vcpu);
+ kvm_pmu_call(deliver_pmi)(vcpu);
kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC);
}
}
@@ -622,14 +622,14 @@ bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
default:
break;
}
- return static_call(kvm_x86_pmu_msr_idx_to_pmc)(vcpu, msr) ||
- static_call(kvm_x86_pmu_is_valid_msr)(vcpu, msr);
+ return kvm_pmu_call(msr_idx_to_pmc)(vcpu, msr) ||
+ kvm_pmu_call(is_valid_msr)(vcpu, msr);
}
static void kvm_pmu_mark_pmc_in_use(struct kvm_vcpu *vcpu, u32 msr)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
- struct kvm_pmc *pmc = static_call(kvm_x86_pmu_msr_idx_to_pmc)(vcpu, msr);
+ struct kvm_pmc *pmc = kvm_pmu_call(msr_idx_to_pmc)(vcpu, msr);
if (pmc)
__set_bit(pmc->idx, pmu->pmc_in_use);
@@ -654,7 +654,7 @@ int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data = 0;
break;
default:
- return static_call(kvm_x86_pmu_get_msr)(vcpu, msr_info);
+ return kvm_pmu_call(get_msr)(vcpu, msr_info);
}
return 0;
@@ -681,13 +681,13 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (!msr_info->host_initiated)
break;
- if (data & pmu->global_status_mask)
+ if (data & pmu->global_status_rsvd)
return 1;
pmu->global_status = data;
break;
case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
- data &= ~pmu->global_ctrl_mask;
+ data &= ~pmu->global_ctrl_rsvd;
fallthrough;
case MSR_CORE_PERF_GLOBAL_CTRL:
if (!kvm_valid_perf_global_ctrl(pmu, data))
@@ -704,7 +704,7 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
* GLOBAL_OVF_CTRL, a.k.a. GLOBAL STATUS_RESET, clears bits in
* GLOBAL_STATUS, and so the set of reserved bits is the same.
*/
- if (data & pmu->global_status_mask)
+ if (data & pmu->global_status_rsvd)
return 1;
fallthrough;
case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR:
@@ -713,7 +713,7 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
break;
default:
kvm_pmu_mark_pmc_in_use(vcpu, msr_info->index);
- return static_call(kvm_x86_pmu_set_msr)(vcpu, msr_info);
+ return kvm_pmu_call(set_msr)(vcpu, msr_info);
}
return 0;
@@ -740,7 +740,7 @@ static void kvm_pmu_reset(struct kvm_vcpu *vcpu)
pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = 0;
- static_call_cond(kvm_x86_pmu_reset)(vcpu);
+ kvm_pmu_call(reset)(vcpu);
}
@@ -768,17 +768,17 @@ void kvm_pmu_refresh(struct kvm_vcpu *vcpu)
pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
pmu->reserved_bits = 0xffffffff00200000ull;
pmu->raw_event_mask = X86_RAW_EVENT_MASK;
- pmu->global_ctrl_mask = ~0ull;
- pmu->global_status_mask = ~0ull;
- pmu->fixed_ctr_ctrl_mask = ~0ull;
- pmu->pebs_enable_mask = ~0ull;
- pmu->pebs_data_cfg_mask = ~0ull;
+ pmu->global_ctrl_rsvd = ~0ull;
+ pmu->global_status_rsvd = ~0ull;
+ pmu->fixed_ctr_ctrl_rsvd = ~0ull;
+ pmu->pebs_enable_rsvd = ~0ull;
+ pmu->pebs_data_cfg_rsvd = ~0ull;
bitmap_zero(pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX);
if (!vcpu->kvm->arch.enable_pmu)
return;
- static_call(kvm_x86_pmu_refresh)(vcpu);
+ kvm_pmu_call(refresh)(vcpu);
/*
* At RESET, both Intel and AMD CPUs set all enable bits for general
@@ -796,7 +796,7 @@ void kvm_pmu_init(struct kvm_vcpu *vcpu)
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
memset(pmu, 0, sizeof(*pmu));
- static_call(kvm_x86_pmu_init)(vcpu);
+ kvm_pmu_call(init)(vcpu);
kvm_pmu_refresh(vcpu);
}
@@ -818,7 +818,7 @@ void kvm_pmu_cleanup(struct kvm_vcpu *vcpu)
pmc_stop_counter(pmc);
}
- static_call_cond(kvm_x86_pmu_cleanup)(vcpu);
+ kvm_pmu_call(cleanup)(vcpu);
bitmap_zero(pmu->pmc_in_use, X86_PMC_IDX_MAX);
}
@@ -846,8 +846,8 @@ static inline bool cpl_is_matched(struct kvm_pmc *pmc)
} else {
config = fixed_ctrl_field(pmc_to_pmu(pmc)->fixed_ctr_ctrl,
pmc->idx - KVM_FIXED_PMC_BASE_IDX);
- select_os = config & 0x1;
- select_user = config & 0x2;
+ select_os = config & INTEL_FIXED_0_KERNEL;
+ select_user = config & INTEL_FIXED_0_USER;
}
/*
@@ -857,7 +857,8 @@ static inline bool cpl_is_matched(struct kvm_pmc *pmc)
if (select_os == select_user)
return select_os;
- return (static_call(kvm_x86_get_cpl)(pmc->vcpu) == 0) ? select_os : select_user;
+ return (kvm_x86_call(get_cpl)(pmc->vcpu) == 0) ? select_os :
+ select_user;
}
void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 eventsel)
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
index 4d52b0b539ba..ad89d0bd6005 100644
--- a/arch/x86/kvm/pmu.h
+++ b/arch/x86/kvm/pmu.h
@@ -14,7 +14,8 @@
MSR_IA32_MISC_ENABLE_BTS_UNAVAIL)
/* retrieve the 4 bits for EN and PMI out of IA32_FIXED_CTR_CTRL */
-#define fixed_ctrl_field(ctrl_reg, idx) (((ctrl_reg) >> ((idx)*4)) & 0xf)
+#define fixed_ctrl_field(ctrl_reg, idx) \
+ (((ctrl_reg) >> ((idx) * INTEL_FIXED_BITS_STRIDE)) & INTEL_FIXED_BITS_MASK)
#define VMWARE_BACKDOOR_PMC_HOST_TSC 0x10000
#define VMWARE_BACKDOOR_PMC_REAL_TIME 0x10001
@@ -129,7 +130,7 @@ static inline bool pmc_is_fixed(struct kvm_pmc *pmc)
static inline bool kvm_valid_perf_global_ctrl(struct kvm_pmu *pmu,
u64 data)
{
- return !(pmu->global_ctrl_mask & data);
+ return !(pmu->global_ctrl_rsvd & data);
}
/* returns general purpose PMC with the specified MSR. Note that it can be
@@ -170,7 +171,8 @@ static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc)
if (pmc_is_fixed(pmc))
return fixed_ctrl_field(pmu->fixed_ctr_ctrl,
- pmc->idx - KVM_FIXED_PMC_BASE_IDX) & 0x3;
+ pmc->idx - KVM_FIXED_PMC_BASE_IDX) &
+ (INTEL_FIXED_0_KERNEL | INTEL_FIXED_0_USER);
return pmc->eventsel & ARCH_PERFMON_EVENTSEL_ENABLE;
}
@@ -217,7 +219,7 @@ static inline void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops)
kvm_pmu_cap.num_counters_gp = min(kvm_pmu_cap.num_counters_gp,
pmu_ops->MAX_NR_GP_COUNTERS);
kvm_pmu_cap.num_counters_fixed = min(kvm_pmu_cap.num_counters_fixed,
- KVM_PMC_MAX_FIXED);
+ KVM_MAX_NR_FIXED_COUNTERS);
kvm_pmu_eventsel.INSTRUCTIONS_RETIRED =
perf_get_hw_event_config(PERF_COUNT_HW_INSTRUCTIONS);
diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h
index 2f4e155080ba..0d17d6b70639 100644
--- a/arch/x86/kvm/reverse_cpuid.h
+++ b/arch/x86/kvm/reverse_cpuid.h
@@ -17,6 +17,7 @@ enum kvm_only_cpuid_leafs {
CPUID_8000_0007_EDX,
CPUID_8000_0022_EAX,
CPUID_7_2_EDX,
+ CPUID_24_0_EBX,
NR_KVM_CPU_CAPS,
NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS,
@@ -46,6 +47,7 @@ enum kvm_only_cpuid_leafs {
#define X86_FEATURE_AVX_NE_CONVERT KVM_X86_FEATURE(CPUID_7_1_EDX, 5)
#define X86_FEATURE_AMX_COMPLEX KVM_X86_FEATURE(CPUID_7_1_EDX, 8)
#define X86_FEATURE_PREFETCHITI KVM_X86_FEATURE(CPUID_7_1_EDX, 14)
+#define X86_FEATURE_AVX10 KVM_X86_FEATURE(CPUID_7_1_EDX, 19)
/* Intel-defined sub-features, CPUID level 0x00000007:2 (EDX) */
#define X86_FEATURE_INTEL_PSFD KVM_X86_FEATURE(CPUID_7_2_EDX, 0)
@@ -55,6 +57,11 @@ enum kvm_only_cpuid_leafs {
#define KVM_X86_FEATURE_BHI_CTRL KVM_X86_FEATURE(CPUID_7_2_EDX, 4)
#define X86_FEATURE_MCDT_NO KVM_X86_FEATURE(CPUID_7_2_EDX, 5)
+/* Intel-defined sub-features, CPUID level 0x00000024:0 (EBX) */
+#define X86_FEATURE_AVX10_128 KVM_X86_FEATURE(CPUID_24_0_EBX, 16)
+#define X86_FEATURE_AVX10_256 KVM_X86_FEATURE(CPUID_24_0_EBX, 17)
+#define X86_FEATURE_AVX10_512 KVM_X86_FEATURE(CPUID_24_0_EBX, 18)
+
/* CPUID level 0x80000007 (EDX). */
#define KVM_X86_FEATURE_CONSTANT_TSC KVM_X86_FEATURE(CPUID_8000_0007_EDX, 8)
@@ -90,6 +97,7 @@ static const struct cpuid_reg reverse_cpuid[] = {
[CPUID_8000_0021_EAX] = {0x80000021, 0, CPUID_EAX},
[CPUID_8000_0022_EAX] = {0x80000022, 0, CPUID_EAX},
[CPUID_7_2_EDX] = { 7, 2, CPUID_EDX},
+ [CPUID_24_0_EBX] = { 0x24, 0, CPUID_EBX},
};
/*
diff --git a/arch/x86/kvm/smm.c b/arch/x86/kvm/smm.c
index d06d43d8d2aa..85241c0c7f56 100644
--- a/arch/x86/kvm/smm.c
+++ b/arch/x86/kvm/smm.c
@@ -200,11 +200,11 @@ static void enter_smm_save_state_32(struct kvm_vcpu *vcpu,
enter_smm_save_seg_32(vcpu, &smram->tr, &smram->tr_sel, VCPU_SREG_TR);
enter_smm_save_seg_32(vcpu, &smram->ldtr, &smram->ldtr_sel, VCPU_SREG_LDTR);
- static_call(kvm_x86_get_gdt)(vcpu, &dt);
+ kvm_x86_call(get_gdt)(vcpu, &dt);
smram->gdtr.base = dt.address;
smram->gdtr.limit = dt.size;
- static_call(kvm_x86_get_idt)(vcpu, &dt);
+ kvm_x86_call(get_idt)(vcpu, &dt);
smram->idtr.base = dt.address;
smram->idtr.limit = dt.size;
@@ -220,7 +220,7 @@ static void enter_smm_save_state_32(struct kvm_vcpu *vcpu,
smram->smm_revision = 0x00020000;
smram->smbase = vcpu->arch.smbase;
- smram->int_shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu);
+ smram->int_shadow = kvm_x86_call(get_interrupt_shadow)(vcpu);
}
#ifdef CONFIG_X86_64
@@ -250,13 +250,13 @@ static void enter_smm_save_state_64(struct kvm_vcpu *vcpu,
enter_smm_save_seg_64(vcpu, &smram->tr, VCPU_SREG_TR);
- static_call(kvm_x86_get_idt)(vcpu, &dt);
+ kvm_x86_call(get_idt)(vcpu, &dt);
smram->idtr.limit = dt.size;
smram->idtr.base = dt.address;
enter_smm_save_seg_64(vcpu, &smram->ldtr, VCPU_SREG_LDTR);
- static_call(kvm_x86_get_gdt)(vcpu, &dt);
+ kvm_x86_call(get_gdt)(vcpu, &dt);
smram->gdtr.limit = dt.size;
smram->gdtr.base = dt.address;
@@ -267,7 +267,7 @@ static void enter_smm_save_state_64(struct kvm_vcpu *vcpu,
enter_smm_save_seg_64(vcpu, &smram->fs, VCPU_SREG_FS);
enter_smm_save_seg_64(vcpu, &smram->gs, VCPU_SREG_GS);
- smram->int_shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu);
+ smram->int_shadow = kvm_x86_call(get_interrupt_shadow)(vcpu);
}
#endif
@@ -297,7 +297,7 @@ void enter_smm(struct kvm_vcpu *vcpu)
* Kill the VM in the unlikely case of failure, because the VM
* can be in undefined state in this case.
*/
- if (static_call(kvm_x86_enter_smm)(vcpu, &smram))
+ if (kvm_x86_call(enter_smm)(vcpu, &smram))
goto error;
kvm_smm_changed(vcpu, true);
@@ -305,24 +305,24 @@ void enter_smm(struct kvm_vcpu *vcpu)
if (kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, &smram, sizeof(smram)))
goto error;
- if (static_call(kvm_x86_get_nmi_mask)(vcpu))
+ if (kvm_x86_call(get_nmi_mask)(vcpu))
vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
else
- static_call(kvm_x86_set_nmi_mask)(vcpu, true);
+ kvm_x86_call(set_nmi_mask)(vcpu, true);
kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
kvm_rip_write(vcpu, 0x8000);
- static_call(kvm_x86_set_interrupt_shadow)(vcpu, 0);
+ kvm_x86_call(set_interrupt_shadow)(vcpu, 0);
cr0 = vcpu->arch.cr0 & ~(X86_CR0_PE | X86_CR0_EM | X86_CR0_TS | X86_CR0_PG);
- static_call(kvm_x86_set_cr0)(vcpu, cr0);
+ kvm_x86_call(set_cr0)(vcpu, cr0);
- static_call(kvm_x86_set_cr4)(vcpu, 0);
+ kvm_x86_call(set_cr4)(vcpu, 0);
/* Undocumented: IDT limit is set to zero on entry to SMM. */
dt.address = dt.size = 0;
- static_call(kvm_x86_set_idt)(vcpu, &dt);
+ kvm_x86_call(set_idt)(vcpu, &dt);
if (WARN_ON_ONCE(kvm_set_dr(vcpu, 7, DR7_FIXED_1)))
goto error;
@@ -354,7 +354,7 @@ void enter_smm(struct kvm_vcpu *vcpu)
#ifdef CONFIG_X86_64
if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
- if (static_call(kvm_x86_set_efer)(vcpu, 0))
+ if (kvm_x86_call(set_efer)(vcpu, 0))
goto error;
#endif
@@ -479,11 +479,11 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt,
dt.address = smstate->gdtr.base;
dt.size = smstate->gdtr.limit;
- static_call(kvm_x86_set_gdt)(vcpu, &dt);
+ kvm_x86_call(set_gdt)(vcpu, &dt);
dt.address = smstate->idtr.base;
dt.size = smstate->idtr.limit;
- static_call(kvm_x86_set_idt)(vcpu, &dt);
+ kvm_x86_call(set_idt)(vcpu, &dt);
rsm_load_seg_32(vcpu, &smstate->es, smstate->es_sel, VCPU_SREG_ES);
rsm_load_seg_32(vcpu, &smstate->cs, smstate->cs_sel, VCPU_SREG_CS);
@@ -501,7 +501,7 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt,
if (r != X86EMUL_CONTINUE)
return r;
- static_call(kvm_x86_set_interrupt_shadow)(vcpu, 0);
+ kvm_x86_call(set_interrupt_shadow)(vcpu, 0);
ctxt->interruptibility = (u8)smstate->int_shadow;
return r;
@@ -535,13 +535,13 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt,
dt.size = smstate->idtr.limit;
dt.address = smstate->idtr.base;
- static_call(kvm_x86_set_idt)(vcpu, &dt);
+ kvm_x86_call(set_idt)(vcpu, &dt);
rsm_load_seg_64(vcpu, &smstate->ldtr, VCPU_SREG_LDTR);
dt.size = smstate->gdtr.limit;
dt.address = smstate->gdtr.base;
- static_call(kvm_x86_set_gdt)(vcpu, &dt);
+ kvm_x86_call(set_gdt)(vcpu, &dt);
r = rsm_enter_protected_mode(vcpu, smstate->cr0, smstate->cr3, smstate->cr4);
if (r != X86EMUL_CONTINUE)
@@ -554,7 +554,7 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt,
rsm_load_seg_64(vcpu, &smstate->fs, VCPU_SREG_FS);
rsm_load_seg_64(vcpu, &smstate->gs, VCPU_SREG_GS);
- static_call(kvm_x86_set_interrupt_shadow)(vcpu, 0);
+ kvm_x86_call(set_interrupt_shadow)(vcpu, 0);
ctxt->interruptibility = (u8)smstate->int_shadow;
return X86EMUL_CONTINUE;
@@ -576,7 +576,7 @@ int emulator_leave_smm(struct x86_emulate_ctxt *ctxt)
return X86EMUL_UNHANDLEABLE;
if ((vcpu->arch.hflags & HF_SMM_INSIDE_NMI_MASK) == 0)
- static_call(kvm_x86_set_nmi_mask)(vcpu, false);
+ kvm_x86_call(set_nmi_mask)(vcpu, false);
kvm_smm_changed(vcpu, false);
@@ -624,17 +624,31 @@ int emulator_leave_smm(struct x86_emulate_ctxt *ctxt)
#endif
/*
- * Give leave_smm() a chance to make ISA-specific changes to the vCPU
- * state (e.g. enter guest mode) before loading state from the SMM
- * state-save area.
+ * FIXME: When resuming L2 (a.k.a. guest mode), the transition to guest
+ * mode should happen _after_ loading state from SMRAM. However, KVM
+ * piggybacks the nested VM-Enter flows (which is wrong for many other
+ * reasons), and so nSVM/nVMX would clobber state that is loaded from
+ * SMRAM and from the VMCS/VMCB.
*/
- if (static_call(kvm_x86_leave_smm)(vcpu, &smram))
+ if (kvm_x86_call(leave_smm)(vcpu, &smram))
return X86EMUL_UNHANDLEABLE;
#ifdef CONFIG_X86_64
if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
- return rsm_load_state_64(ctxt, &smram.smram64);
+ ret = rsm_load_state_64(ctxt, &smram.smram64);
else
#endif
- return rsm_load_state_32(ctxt, &smram.smram32);
+ ret = rsm_load_state_32(ctxt, &smram.smram32);
+
+ /*
+ * If RSM fails and triggers shutdown, architecturally the shutdown
+ * occurs *before* the transition to guest mode. But due to KVM's
+ * flawed handling of RSM to L2 (see above), the vCPU may already be
+ * in_guest_mode(). Force the vCPU out of guest mode before delivering
+ * the shutdown, so that L1 enters shutdown instead of seeing a VM-Exit
+ * that architecturally shouldn't be possible.
+ */
+ if (ret != X86EMUL_CONTINUE && is_guest_mode(vcpu))
+ kvm_leave_nested(vcpu);
+ return ret;
}
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 55b9a6d96bcf..d5314cb7dff4 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -1181,7 +1181,7 @@ int svm_allocate_nested(struct vcpu_svm *svm)
if (svm->nested.initialized)
return 0;
- vmcb02_page = snp_safe_alloc_page(&svm->vcpu);
+ vmcb02_page = snp_safe_alloc_page();
if (!vmcb02_page)
return -ENOMEM;
svm->nested.vmcb02.ptr = page_address(vmcb02_page);
@@ -1693,8 +1693,8 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
return -EINVAL;
ret = -ENOMEM;
- ctl = kzalloc(sizeof(*ctl), GFP_KERNEL_ACCOUNT);
- save = kzalloc(sizeof(*save), GFP_KERNEL_ACCOUNT);
+ ctl = kzalloc(sizeof(*ctl), GFP_KERNEL);
+ save = kzalloc(sizeof(*save), GFP_KERNEL);
if (!ctl || !save)
goto out_free;
diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c
index dfcc38bd97d3..22d5a65b410c 100644
--- a/arch/x86/kvm/svm/pmu.c
+++ b/arch/x86/kvm/svm/pmu.c
@@ -199,8 +199,8 @@ static void amd_pmu_refresh(struct kvm_vcpu *vcpu)
kvm_pmu_cap.num_counters_gp);
if (pmu->version > 1) {
- pmu->global_ctrl_mask = ~((1ull << pmu->nr_arch_gp_counters) - 1);
- pmu->global_status_mask = pmu->global_ctrl_mask;
+ pmu->global_ctrl_rsvd = ~((1ull << pmu->nr_arch_gp_counters) - 1);
+ pmu->global_status_rsvd = pmu->global_ctrl_rsvd;
}
pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << 48) - 1;
@@ -217,10 +217,9 @@ static void amd_pmu_init(struct kvm_vcpu *vcpu)
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
int i;
- BUILD_BUG_ON(KVM_AMD_PMC_MAX_GENERIC > AMD64_NUM_COUNTERS_CORE);
- BUILD_BUG_ON(KVM_AMD_PMC_MAX_GENERIC > INTEL_PMC_MAX_GENERIC);
+ BUILD_BUG_ON(KVM_MAX_NR_AMD_GP_COUNTERS > AMD64_NUM_COUNTERS_CORE);
- for (i = 0; i < KVM_AMD_PMC_MAX_GENERIC ; i++) {
+ for (i = 0; i < KVM_MAX_NR_AMD_GP_COUNTERS; i++) {
pmu->gp_counters[i].type = KVM_PMC_GP;
pmu->gp_counters[i].vcpu = vcpu;
pmu->gp_counters[i].idx = i;
@@ -238,6 +237,6 @@ struct kvm_pmu_ops amd_pmu_ops __initdata = {
.refresh = amd_pmu_refresh,
.init = amd_pmu_init,
.EVENTSEL_EVENT = AMD64_EVENTSEL_EVENT,
- .MAX_NR_GP_COUNTERS = KVM_AMD_PMC_MAX_GENERIC,
+ .MAX_NR_GP_COUNTERS = KVM_MAX_NR_AMD_GP_COUNTERS,
.MIN_NR_GP_COUNTERS = AMD64_NUM_COUNTERS,
};
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 759581bb2128..0b851ef937f2 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -19,11 +19,14 @@
#include <linux/misc_cgroup.h>
#include <linux/processor.h>
#include <linux/trace_events.h>
+#include <uapi/linux/sev-guest.h>
#include <asm/pkru.h>
#include <asm/trapnr.h>
#include <asm/fpu/xcr.h>
+#include <asm/fpu/xstate.h>
#include <asm/debugreg.h>
+#include <asm/sev.h>
#include "mmu.h"
#include "x86.h"
@@ -32,22 +35,12 @@
#include "cpuid.h"
#include "trace.h"
-#ifndef CONFIG_KVM_AMD_SEV
-/*
- * When this config is not defined, SEV feature is not supported and APIs in
- * this file are not used but this file still gets compiled into the KVM AMD
- * module.
- *
- * We will not have MISC_CG_RES_SEV and MISC_CG_RES_SEV_ES entries in the enum
- * misc_res_type {} defined in linux/misc_cgroup.h.
- *
- * Below macros allow compilation to succeed.
- */
-#define MISC_CG_RES_SEV MISC_CG_RES_TYPES
-#define MISC_CG_RES_SEV_ES MISC_CG_RES_TYPES
-#endif
+#define GHCB_VERSION_MAX 2ULL
+#define GHCB_VERSION_DEFAULT 2ULL
+#define GHCB_VERSION_MIN 1ULL
+
+#define GHCB_HV_FT_SUPPORTED (GHCB_HV_FT_SNP | GHCB_HV_FT_SNP_AP_CREATION)
-#ifdef CONFIG_KVM_AMD_SEV
/* enable/disable SEV support */
static bool sev_enabled = true;
module_param_named(sev, sev_enabled, bool, 0444);
@@ -56,14 +49,35 @@ module_param_named(sev, sev_enabled, bool, 0444);
static bool sev_es_enabled = true;
module_param_named(sev_es, sev_es_enabled, bool, 0444);
+/* enable/disable SEV-SNP support */
+static bool sev_snp_enabled = true;
+module_param_named(sev_snp, sev_snp_enabled, bool, 0444);
+
/* enable/disable SEV-ES DebugSwap support */
-static bool sev_es_debug_swap_enabled = false;
+static bool sev_es_debug_swap_enabled = true;
module_param_named(debug_swap, sev_es_debug_swap_enabled, bool, 0444);
-#else
-#define sev_enabled false
-#define sev_es_enabled false
-#define sev_es_debug_swap_enabled false
-#endif /* CONFIG_KVM_AMD_SEV */
+static u64 sev_supported_vmsa_features;
+
+#define AP_RESET_HOLD_NONE 0
+#define AP_RESET_HOLD_NAE_EVENT 1
+#define AP_RESET_HOLD_MSR_PROTO 2
+
+/* As defined by SEV-SNP Firmware ABI, under "Guest Policy". */
+#define SNP_POLICY_MASK_API_MINOR GENMASK_ULL(7, 0)
+#define SNP_POLICY_MASK_API_MAJOR GENMASK_ULL(15, 8)
+#define SNP_POLICY_MASK_SMT BIT_ULL(16)
+#define SNP_POLICY_MASK_RSVD_MBO BIT_ULL(17)
+#define SNP_POLICY_MASK_DEBUG BIT_ULL(19)
+#define SNP_POLICY_MASK_SINGLE_SOCKET BIT_ULL(20)
+
+#define SNP_POLICY_MASK_VALID (SNP_POLICY_MASK_API_MINOR | \
+ SNP_POLICY_MASK_API_MAJOR | \
+ SNP_POLICY_MASK_SMT | \
+ SNP_POLICY_MASK_RSVD_MBO | \
+ SNP_POLICY_MASK_DEBUG | \
+ SNP_POLICY_MASK_SINGLE_SOCKET)
+
+#define INITIAL_VMSA_GPA 0xFFFFFFFFF000
static u8 sev_enc_bit;
static DECLARE_RWSEM(sev_deactivate_lock);
@@ -75,6 +89,8 @@ static unsigned int nr_asids;
static unsigned long *sev_asid_bitmap;
static unsigned long *sev_reclaim_asid_bitmap;
+static int snp_decommission_context(struct kvm *kvm);
+
struct enc_region {
struct list_head list;
unsigned long npages;
@@ -101,19 +117,32 @@ static int sev_flush_asids(unsigned int min_asid, unsigned int max_asid)
down_write(&sev_deactivate_lock);
wbinvd_on_all_cpus();
- ret = sev_guest_df_flush(&error);
+
+ if (sev_snp_enabled)
+ ret = sev_do_cmd(SEV_CMD_SNP_DF_FLUSH, NULL, &error);
+ else
+ ret = sev_guest_df_flush(&error);
up_write(&sev_deactivate_lock);
if (ret)
- pr_err("SEV: DF_FLUSH failed, ret=%d, error=%#x\n", ret, error);
+ pr_err("SEV%s: DF_FLUSH failed, ret=%d, error=%#x\n",
+ sev_snp_enabled ? "-SNP" : "", ret, error);
return ret;
}
static inline bool is_mirroring_enc_context(struct kvm *kvm)
{
- return !!to_kvm_svm(kvm)->sev_info.enc_context_owner;
+ return !!to_kvm_sev_info(kvm)->enc_context_owner;
+}
+
+static bool sev_vcpu_has_debug_swap(struct vcpu_svm *svm)
+{
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info;
+
+ return sev->vmsa_features & SVM_SEV_FEAT_DEBUG_SWAP;
}
/* Must be called with the sev_bitmap_lock held */
@@ -234,6 +263,53 @@ static void sev_decommission(unsigned int handle)
sev_guest_decommission(&decommission, NULL);
}
+/*
+ * Transition a page to hypervisor-owned/shared state in the RMP table. This
+ * should not fail under normal conditions, but leak the page should that
+ * happen since it will no longer be usable by the host due to RMP protections.
+ */
+static int kvm_rmp_make_shared(struct kvm *kvm, u64 pfn, enum pg_level level)
+{
+ if (KVM_BUG_ON(rmp_make_shared(pfn, level), kvm)) {
+ snp_leak_pages(pfn, page_level_size(level) >> PAGE_SHIFT);
+ return -EIO;
+ }
+
+ return 0;
+}
+
+/*
+ * Certain page-states, such as Pre-Guest and Firmware pages (as documented
+ * in Chapter 5 of the SEV-SNP Firmware ABI under "Page States") cannot be
+ * directly transitioned back to normal/hypervisor-owned state via RMPUPDATE
+ * unless they are reclaimed first.
+ *
+ * Until they are reclaimed and subsequently transitioned via RMPUPDATE, they
+ * might not be usable by the host due to being set as immutable or still
+ * being associated with a guest ASID.
+ *
+ * Bug the VM and leak the page if reclaim fails, or if the RMP entry can't be
+ * converted back to shared, as the page is no longer usable due to RMP
+ * protections, and it's infeasible for the guest to continue on.
+ */
+static int snp_page_reclaim(struct kvm *kvm, u64 pfn)
+{
+ struct sev_data_snp_page_reclaim data = {0};
+ int fw_err, rc;
+
+ data.paddr = __sme_set(pfn << PAGE_SHIFT);
+ rc = sev_do_cmd(SEV_CMD_SNP_PAGE_RECLAIM, &data, &fw_err);
+ if (KVM_BUG(rc, kvm, "Failed to reclaim PFN %llx, rc %d fw_err %d", pfn, rc, fw_err)) {
+ snp_leak_pages(pfn, 1);
+ return -EIO;
+ }
+
+ if (kvm_rmp_make_shared(kvm, pfn, PG_LEVEL_4K))
+ return -EIO;
+
+ return rc;
+}
+
static void sev_unbind_asid(struct kvm *kvm, unsigned int handle)
{
struct sev_data_deactivate deactivate;
@@ -251,20 +327,119 @@ static void sev_unbind_asid(struct kvm *kvm, unsigned int handle)
sev_decommission(handle);
}
-static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
+/*
+ * This sets up bounce buffers/firmware pages to handle SNP Guest Request
+ * messages (e.g. attestation requests). See "SNP Guest Request" in the GHCB
+ * 2.0 specification for more details.
+ *
+ * Technically, when an SNP Guest Request is issued, the guest will provide its
+ * own request/response pages, which could in theory be passed along directly
+ * to firmware rather than using bounce pages. However, these pages would need
+ * special care:
+ *
+ * - Both pages are from shared guest memory, so they need to be protected
+ * from migration/etc. occurring while firmware reads/writes to them. At a
+ * minimum, this requires elevating the ref counts and potentially needing
+ * an explicit pinning of the memory. This places additional restrictions
+ * on what type of memory backends userspace can use for shared guest
+ * memory since there is some reliance on using refcounted pages.
+ *
+ * - The response page needs to be switched to Firmware-owned[1] state
+ * before the firmware can write to it, which can lead to potential
+ * host RMP #PFs if the guest is misbehaved and hands the host a
+ * guest page that KVM might write to for other reasons (e.g. virtio
+ * buffers/etc.).
+ *
+ * Both of these issues can be avoided completely by using separately-allocated
+ * bounce pages for both the request/response pages and passing those to
+ * firmware instead. So that's what is being set up here.
+ *
+ * Guest requests rely on message sequence numbers to ensure requests are
+ * issued to firmware in the order the guest issues them, so concurrent guest
+ * requests generally shouldn't happen. But a misbehaved guest could issue
+ * concurrent guest requests in theory, so a mutex is used to serialize
+ * access to the bounce buffers.
+ *
+ * [1] See the "Page States" section of the SEV-SNP Firmware ABI for more
+ * details on Firmware-owned pages, along with "RMP and VMPL Access Checks"
+ * in the APM for details on the related RMP restrictions.
+ */
+static int snp_guest_req_init(struct kvm *kvm)
+{
+ struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
+ struct page *req_page;
+
+ req_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ if (!req_page)
+ return -ENOMEM;
+
+ sev->guest_resp_buf = snp_alloc_firmware_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ if (!sev->guest_resp_buf) {
+ __free_page(req_page);
+ return -EIO;
+ }
+
+ sev->guest_req_buf = page_address(req_page);
+ mutex_init(&sev->guest_req_mutex);
+
+ return 0;
+}
+
+static void snp_guest_req_cleanup(struct kvm *kvm)
+{
+ struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
+
+ if (sev->guest_resp_buf)
+ snp_free_firmware_page(sev->guest_resp_buf);
+
+ if (sev->guest_req_buf)
+ __free_page(virt_to_page(sev->guest_req_buf));
+
+ sev->guest_req_buf = NULL;
+ sev->guest_resp_buf = NULL;
+}
+
+static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp,
+ struct kvm_sev_init *data,
+ unsigned long vm_type)
{
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
struct sev_platform_init_args init_args = {0};
+ bool es_active = vm_type != KVM_X86_SEV_VM;
+ u64 valid_vmsa_features = es_active ? sev_supported_vmsa_features : 0;
int ret;
if (kvm->created_vcpus)
return -EINVAL;
+ if (data->flags)
+ return -EINVAL;
+
+ if (data->vmsa_features & ~valid_vmsa_features)
+ return -EINVAL;
+
+ if (data->ghcb_version > GHCB_VERSION_MAX || (!es_active && data->ghcb_version))
+ return -EINVAL;
+
if (unlikely(sev->active))
return -EINVAL;
sev->active = true;
- sev->es_active = argp->id == KVM_SEV_ES_INIT;
+ sev->es_active = es_active;
+ sev->vmsa_features = data->vmsa_features;
+ sev->ghcb_version = data->ghcb_version;
+
+ /*
+ * Currently KVM supports the full range of mandatory features defined
+ * by version 2 of the GHCB protocol, so default to that for SEV-ES
+ * guests created via KVM_SEV_INIT2.
+ */
+ if (sev->es_active && !sev->ghcb_version)
+ sev->ghcb_version = GHCB_VERSION_DEFAULT;
+
+ if (vm_type == KVM_X86_SNP_VM)
+ sev->vmsa_features |= SVM_SEV_FEAT_SNP_ACTIVE;
+
ret = sev_asid_new(sev);
if (ret)
goto e_no_asid;
@@ -274,8 +449,13 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (ret)
goto e_free;
+ /* This needs to happen after SEV/SNP firmware initialization. */
+ if (vm_type == KVM_X86_SNP_VM && snp_guest_req_init(kvm))
+ goto e_free;
+
INIT_LIST_HEAD(&sev->regions_list);
INIT_LIST_HEAD(&sev->mirror_vms);
+ sev->need_init = false;
kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_SEV);
@@ -286,11 +466,54 @@ e_free:
sev_asid_free(sev);
sev->asid = 0;
e_no_asid:
+ sev->vmsa_features = 0;
sev->es_active = false;
sev->active = false;
return ret;
}
+static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_init data = {
+ .vmsa_features = 0,
+ .ghcb_version = 0,
+ };
+ unsigned long vm_type;
+
+ if (kvm->arch.vm_type != KVM_X86_DEFAULT_VM)
+ return -EINVAL;
+
+ vm_type = (argp->id == KVM_SEV_INIT ? KVM_X86_SEV_VM : KVM_X86_SEV_ES_VM);
+
+ /*
+ * KVM_SEV_ES_INIT has been deprecated by KVM_SEV_INIT2, so it will
+ * continue to only ever support the minimal GHCB protocol version.
+ */
+ if (vm_type == KVM_X86_SEV_ES_VM)
+ data.ghcb_version = GHCB_VERSION_MIN;
+
+ return __sev_guest_init(kvm, argp, &data, vm_type);
+}
+
+static int sev_guest_init2(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct kvm_sev_init data;
+
+ if (!sev->need_init)
+ return -EINVAL;
+
+ if (kvm->arch.vm_type != KVM_X86_SEV_VM &&
+ kvm->arch.vm_type != KVM_X86_SEV_ES_VM &&
+ kvm->arch.vm_type != KVM_X86_SNP_VM)
+ return -EINVAL;
+
+ if (copy_from_user(&data, u64_to_user_ptr(argp->data), sizeof(data)))
+ return -EFAULT;
+
+ return __sev_guest_init(kvm, argp, &data, kvm->arch.vm_type);
+}
+
static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error)
{
unsigned int asid = sev_get_asid(kvm);
@@ -311,10 +534,10 @@ static int __sev_issue_cmd(int fd, int id, void *data, int *error)
int ret;
f = fdget(fd);
- if (!f.file)
+ if (!fd_file(f))
return -EBADF;
- ret = sev_issue_cmd_external_user(f.file, id, data, error);
+ ret = sev_issue_cmd_external_user(fd_file(f), id, data, error);
fdput(f);
return ret;
@@ -339,7 +562,7 @@ static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (!sev_guest(kvm))
return -ENOTTY;
- if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
+ if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
return -EFAULT;
memset(&start, 0, sizeof(start));
@@ -383,7 +606,7 @@ static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
/* return handle to userspace */
params.handle = start.handle;
- if (copy_to_user((void __user *)(uintptr_t)argp->data, &params, sizeof(params))) {
+ if (copy_to_user(u64_to_user_ptr(argp->data), &params, sizeof(params))) {
sev_unbind_asid(kvm, start.handle);
ret = -EFAULT;
goto e_free_session;
@@ -522,7 +745,7 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (!sev_guest(kvm))
return -ENOTTY;
- if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
+ if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
return -EFAULT;
vaddr = params.uaddr;
@@ -580,7 +803,13 @@ e_unpin:
static int sev_es_sync_vmsa(struct vcpu_svm *svm)
{
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info;
struct sev_es_save_area *save = svm->sev_es.vmsa;
+ struct xregs_state *xsave;
+ const u8 *s;
+ u8 *d;
+ int i;
/* Check some debug related fields before encrypting the VMSA */
if (svm->vcpu.guest_debug || (svm->vmcb->save.dr7 & ~DR7_FIXED_1))
@@ -621,10 +850,44 @@ static int sev_es_sync_vmsa(struct vcpu_svm *svm)
save->xss = svm->vcpu.arch.ia32_xss;
save->dr6 = svm->vcpu.arch.dr6;
- if (sev_es_debug_swap_enabled) {
- save->sev_features |= SVM_SEV_FEAT_DEBUG_SWAP;
- pr_warn_once("Enabling DebugSwap with KVM_SEV_ES_INIT. "
- "This will not work starting with Linux 6.10\n");
+ save->sev_features = sev->vmsa_features;
+
+ /*
+ * Skip FPU and AVX setup with KVM_SEV_ES_INIT to avoid
+ * breaking older measurements.
+ */
+ if (vcpu->kvm->arch.vm_type != KVM_X86_DEFAULT_VM) {
+ xsave = &vcpu->arch.guest_fpu.fpstate->regs.xsave;
+ save->x87_dp = xsave->i387.rdp;
+ save->mxcsr = xsave->i387.mxcsr;
+ save->x87_ftw = xsave->i387.twd;
+ save->x87_fsw = xsave->i387.swd;
+ save->x87_fcw = xsave->i387.cwd;
+ save->x87_fop = xsave->i387.fop;
+ save->x87_ds = 0;
+ save->x87_cs = 0;
+ save->x87_rip = xsave->i387.rip;
+
+ for (i = 0; i < 8; i++) {
+ /*
+ * The format of the x87 save area is undocumented and
+ * definitely not what you would expect. It consists of
+ * an 8*8 bytes area with bytes 0-7, and an 8*2 bytes
+ * area with bytes 8-9 of each register.
+ */
+ d = save->fpreg_x87 + i * 8;
+ s = ((u8 *)xsave->i387.st_space) + i * 16;
+ memcpy(d, s, 8);
+ save->fpreg_x87[64 + i * 2] = s[8];
+ save->fpreg_x87[64 + i * 2 + 1] = s[9];
+ }
+ memcpy(save->fpreg_xmm, xsave->i387.xmm_space, 256);
+
+ s = get_xsave_addr(xsave, XFEATURE_YMM);
+ if (s)
+ memcpy(save->fpreg_ymm, s, 256);
+ else
+ memset(save->fpreg_ymm, 0, 256);
}
pr_debug("Virtual Machine Save Area (VMSA):\n");
@@ -658,14 +921,29 @@ static int __sev_launch_update_vmsa(struct kvm *kvm, struct kvm_vcpu *vcpu,
clflush_cache_range(svm->sev_es.vmsa, PAGE_SIZE);
vmsa.reserved = 0;
- vmsa.handle = to_kvm_svm(kvm)->sev_info.handle;
+ vmsa.handle = to_kvm_sev_info(kvm)->handle;
vmsa.address = __sme_pa(svm->sev_es.vmsa);
vmsa.len = PAGE_SIZE;
ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_VMSA, &vmsa, error);
if (ret)
return ret;
+ /*
+ * SEV-ES guests maintain an encrypted version of their FPU
+ * state which is restored and saved on VMRUN and VMEXIT.
+ * Mark vcpu->arch.guest_fpu->fpstate as scratch so it won't
+ * do xsave/xrstor on it.
+ */
+ fpstate_set_confidential(&vcpu->arch.guest_fpu);
vcpu->arch.guest_state_protected = true;
+
+ /*
+ * SEV-ES guest mandates LBR Virtualization to be _always_ ON. Enable it
+ * only after setting guest_state_protected because KVM_SET_MSRS allows
+ * dynamic toggling of LBRV (for performance reason) on write access to
+ * MSR_IA32_DEBUGCTLMSR when guest_state_protected is not set.
+ */
+ svm_enable_lbrv(vcpu);
return 0;
}
@@ -695,7 +973,7 @@ static int sev_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp)
static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
- void __user *measure = (void __user *)(uintptr_t)argp->data;
+ void __user *measure = u64_to_user_ptr(argp->data);
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
struct sev_data_launch_measure data;
struct kvm_sev_launch_measure params;
@@ -715,7 +993,7 @@ static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (!params.len)
goto cmd;
- p = (void __user *)(uintptr_t)params.uaddr;
+ p = u64_to_user_ptr(params.uaddr);
if (p) {
if (params.len > SEV_FW_BLOB_MAX_SIZE)
return -EINVAL;
@@ -788,7 +1066,7 @@ static int sev_guest_status(struct kvm *kvm, struct kvm_sev_cmd *argp)
params.state = data.state;
params.handle = data.handle;
- if (copy_to_user((void __user *)(uintptr_t)argp->data, &params, sizeof(params)))
+ if (copy_to_user(u64_to_user_ptr(argp->data), &params, sizeof(params)))
ret = -EFAULT;
return ret;
@@ -953,7 +1231,7 @@ static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec)
if (!sev_guest(kvm))
return -ENOTTY;
- if (copy_from_user(&debug, (void __user *)(uintptr_t)argp->data, sizeof(debug)))
+ if (copy_from_user(&debug, u64_to_user_ptr(argp->data), sizeof(debug)))
return -EFAULT;
if (!debug.len || debug.src_uaddr + debug.len < debug.src_uaddr)
@@ -1037,7 +1315,7 @@ static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (!sev_guest(kvm))
return -ENOTTY;
- if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
+ if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
return -EFAULT;
pages = sev_pin_memory(kvm, params.guest_uaddr, params.guest_len, &n, 1);
@@ -1101,7 +1379,7 @@ e_unpin_memory:
static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
- void __user *report = (void __user *)(uintptr_t)argp->data;
+ void __user *report = u64_to_user_ptr(argp->data);
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
struct sev_data_attestation_report data;
struct kvm_sev_attestation_report params;
@@ -1112,7 +1390,7 @@ static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (!sev_guest(kvm))
return -ENOTTY;
- if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
+ if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
return -EFAULT;
memset(&data, 0, sizeof(data));
@@ -1121,7 +1399,7 @@ static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (!params.len)
goto cmd;
- p = (void __user *)(uintptr_t)params.uaddr;
+ p = u64_to_user_ptr(params.uaddr);
if (p) {
if (params.len > SEV_FW_BLOB_MAX_SIZE)
return -EINVAL;
@@ -1174,7 +1452,7 @@ __sev_send_start_query_session_length(struct kvm *kvm, struct kvm_sev_cmd *argp,
ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, &data, &argp->error);
params->session_len = data.session_len;
- if (copy_to_user((void __user *)(uintptr_t)argp->data, params,
+ if (copy_to_user(u64_to_user_ptr(argp->data), params,
sizeof(struct kvm_sev_send_start)))
ret = -EFAULT;
@@ -1193,7 +1471,7 @@ static int sev_send_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (!sev_guest(kvm))
return -ENOTTY;
- if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data,
+ if (copy_from_user(&params, u64_to_user_ptr(argp->data),
sizeof(struct kvm_sev_send_start)))
return -EFAULT;
@@ -1248,7 +1526,7 @@ static int sev_send_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, &data, &argp->error);
- if (!ret && copy_to_user((void __user *)(uintptr_t)params.session_uaddr,
+ if (!ret && copy_to_user(u64_to_user_ptr(params.session_uaddr),
session_data, params.session_len)) {
ret = -EFAULT;
goto e_free_amd_cert;
@@ -1256,7 +1534,7 @@ static int sev_send_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
params.policy = data.policy;
params.session_len = data.session_len;
- if (copy_to_user((void __user *)(uintptr_t)argp->data, &params,
+ if (copy_to_user(u64_to_user_ptr(argp->data), &params,
sizeof(struct kvm_sev_send_start)))
ret = -EFAULT;
@@ -1287,7 +1565,7 @@ __sev_send_update_data_query_lengths(struct kvm *kvm, struct kvm_sev_cmd *argp,
params->hdr_len = data.hdr_len;
params->trans_len = data.trans_len;
- if (copy_to_user((void __user *)(uintptr_t)argp->data, params,
+ if (copy_to_user(u64_to_user_ptr(argp->data), params,
sizeof(struct kvm_sev_send_update_data)))
ret = -EFAULT;
@@ -1307,7 +1585,7 @@ static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (!sev_guest(kvm))
return -ENOTTY;
- if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data,
+ if (copy_from_user(&params, u64_to_user_ptr(argp->data),
sizeof(struct kvm_sev_send_update_data)))
return -EFAULT;
@@ -1358,14 +1636,14 @@ static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
goto e_free_trans_data;
/* copy transport buffer to user space */
- if (copy_to_user((void __user *)(uintptr_t)params.trans_uaddr,
+ if (copy_to_user(u64_to_user_ptr(params.trans_uaddr),
trans_data, params.trans_len)) {
ret = -EFAULT;
goto e_free_trans_data;
}
/* Copy packet header to userspace. */
- if (copy_to_user((void __user *)(uintptr_t)params.hdr_uaddr, hdr,
+ if (copy_to_user(u64_to_user_ptr(params.hdr_uaddr), hdr,
params.hdr_len))
ret = -EFAULT;
@@ -1417,7 +1695,7 @@ static int sev_receive_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
return -ENOTTY;
/* Get parameter from the userspace */
- if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data,
+ if (copy_from_user(&params, u64_to_user_ptr(argp->data),
sizeof(struct kvm_sev_receive_start)))
return -EFAULT;
@@ -1459,7 +1737,7 @@ static int sev_receive_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
}
params.handle = start.handle;
- if (copy_to_user((void __user *)(uintptr_t)argp->data,
+ if (copy_to_user(u64_to_user_ptr(argp->data),
&params, sizeof(struct kvm_sev_receive_start))) {
ret = -EFAULT;
sev_unbind_asid(kvm, start.handle);
@@ -1490,7 +1768,7 @@ static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (!sev_guest(kvm))
return -EINVAL;
- if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data,
+ if (copy_from_user(&params, u64_to_user_ptr(argp->data),
sizeof(struct kvm_sev_receive_update_data)))
return -EFAULT;
@@ -1705,6 +1983,7 @@ static void sev_migrate_from(struct kvm *dst_kvm, struct kvm *src_kvm)
dst->pages_locked = src->pages_locked;
dst->enc_context_owner = src->enc_context_owner;
dst->es_active = src->es_active;
+ dst->vmsa_features = src->vmsa_features;
src->asid = 0;
src->active = false;
@@ -1799,20 +2078,21 @@ int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd)
bool charged = false;
int ret;
- if (!f.file)
+ if (!fd_file(f))
return -EBADF;
- if (!file_is_kvm(f.file)) {
+ if (!file_is_kvm(fd_file(f))) {
ret = -EBADF;
goto out_fput;
}
- source_kvm = f.file->private_data;
+ source_kvm = fd_file(f)->private_data;
ret = sev_lock_two_vms(kvm, source_kvm);
if (ret)
goto out_fput;
- if (sev_guest(kvm) || !sev_guest(source_kvm)) {
+ if (kvm->arch.vm_type != source_kvm->arch.vm_type ||
+ sev_guest(kvm) || !sev_guest(source_kvm)) {
ret = -EINVAL;
goto out_unlock;
}
@@ -1861,6 +2141,427 @@ out_fput:
return ret;
}
+int sev_dev_get_attr(u32 group, u64 attr, u64 *val)
+{
+ if (group != KVM_X86_GRP_SEV)
+ return -ENXIO;
+
+ switch (attr) {
+ case KVM_X86_SEV_VMSA_FEATURES:
+ *val = sev_supported_vmsa_features;
+ return 0;
+
+ default:
+ return -ENXIO;
+ }
+}
+
+/*
+ * The guest context contains all the information, keys and metadata
+ * associated with the guest that the firmware tracks to implement SEV
+ * and SNP features. The firmware stores the guest context in hypervisor
+ * provide page via the SNP_GCTX_CREATE command.
+ */
+static void *snp_context_create(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct sev_data_snp_addr data = {};
+ void *context;
+ int rc;
+
+ /* Allocate memory for context page */
+ context = snp_alloc_firmware_page(GFP_KERNEL_ACCOUNT);
+ if (!context)
+ return NULL;
+
+ data.address = __psp_pa(context);
+ rc = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_GCTX_CREATE, &data, &argp->error);
+ if (rc) {
+ pr_warn("Failed to create SEV-SNP context, rc %d fw_error %d",
+ rc, argp->error);
+ snp_free_firmware_page(context);
+ return NULL;
+ }
+
+ return context;
+}
+
+static int snp_bind_asid(struct kvm *kvm, int *error)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_snp_activate data = {0};
+
+ data.gctx_paddr = __psp_pa(sev->snp_context);
+ data.asid = sev_get_asid(kvm);
+ return sev_issue_cmd(kvm, SEV_CMD_SNP_ACTIVATE, &data, error);
+}
+
+static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_snp_launch_start start = {0};
+ struct kvm_sev_snp_launch_start params;
+ int rc;
+
+ if (!sev_snp_guest(kvm))
+ return -ENOTTY;
+
+ if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
+ return -EFAULT;
+
+ /* Don't allow userspace to allocate memory for more than 1 SNP context. */
+ if (sev->snp_context)
+ return -EINVAL;
+
+ sev->snp_context = snp_context_create(kvm, argp);
+ if (!sev->snp_context)
+ return -ENOTTY;
+
+ if (params.flags)
+ return -EINVAL;
+
+ if (params.policy & ~SNP_POLICY_MASK_VALID)
+ return -EINVAL;
+
+ /* Check for policy bits that must be set */
+ if (!(params.policy & SNP_POLICY_MASK_RSVD_MBO) ||
+ !(params.policy & SNP_POLICY_MASK_SMT))
+ return -EINVAL;
+
+ if (params.policy & SNP_POLICY_MASK_SINGLE_SOCKET)
+ return -EINVAL;
+
+ start.gctx_paddr = __psp_pa(sev->snp_context);
+ start.policy = params.policy;
+ memcpy(start.gosvw, params.gosvw, sizeof(params.gosvw));
+ rc = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_LAUNCH_START, &start, &argp->error);
+ if (rc) {
+ pr_debug("%s: SEV_CMD_SNP_LAUNCH_START firmware command failed, rc %d\n",
+ __func__, rc);
+ goto e_free_context;
+ }
+
+ sev->fd = argp->sev_fd;
+ rc = snp_bind_asid(kvm, &argp->error);
+ if (rc) {
+ pr_debug("%s: Failed to bind ASID to SEV-SNP context, rc %d\n",
+ __func__, rc);
+ goto e_free_context;
+ }
+
+ return 0;
+
+e_free_context:
+ snp_decommission_context(kvm);
+
+ return rc;
+}
+
+struct sev_gmem_populate_args {
+ __u8 type;
+ int sev_fd;
+ int fw_error;
+};
+
+static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn_start, kvm_pfn_t pfn,
+ void __user *src, int order, void *opaque)
+{
+ struct sev_gmem_populate_args *sev_populate_args = opaque;
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ int n_private = 0, ret, i;
+ int npages = (1 << order);
+ gfn_t gfn;
+
+ if (WARN_ON_ONCE(sev_populate_args->type != KVM_SEV_SNP_PAGE_TYPE_ZERO && !src))
+ return -EINVAL;
+
+ for (gfn = gfn_start, i = 0; gfn < gfn_start + npages; gfn++, i++) {
+ struct sev_data_snp_launch_update fw_args = {0};
+ bool assigned = false;
+ int level;
+
+ ret = snp_lookup_rmpentry((u64)pfn + i, &assigned, &level);
+ if (ret || assigned) {
+ pr_debug("%s: Failed to ensure GFN 0x%llx RMP entry is initial shared state, ret: %d assigned: %d\n",
+ __func__, gfn, ret, assigned);
+ ret = ret ? -EINVAL : -EEXIST;
+ goto err;
+ }
+
+ if (src) {
+ void *vaddr = kmap_local_pfn(pfn + i);
+
+ if (copy_from_user(vaddr, src + i * PAGE_SIZE, PAGE_SIZE)) {
+ ret = -EFAULT;
+ goto err;
+ }
+ kunmap_local(vaddr);
+ }
+
+ ret = rmp_make_private(pfn + i, gfn << PAGE_SHIFT, PG_LEVEL_4K,
+ sev_get_asid(kvm), true);
+ if (ret)
+ goto err;
+
+ n_private++;
+
+ fw_args.gctx_paddr = __psp_pa(sev->snp_context);
+ fw_args.address = __sme_set(pfn_to_hpa(pfn + i));
+ fw_args.page_size = PG_LEVEL_TO_RMP(PG_LEVEL_4K);
+ fw_args.page_type = sev_populate_args->type;
+
+ ret = __sev_issue_cmd(sev_populate_args->sev_fd, SEV_CMD_SNP_LAUNCH_UPDATE,
+ &fw_args, &sev_populate_args->fw_error);
+ if (ret)
+ goto fw_err;
+ }
+
+ return 0;
+
+fw_err:
+ /*
+ * If the firmware command failed handle the reclaim and cleanup of that
+ * PFN specially vs. prior pages which can be cleaned up below without
+ * needing to reclaim in advance.
+ *
+ * Additionally, when invalid CPUID function entries are detected,
+ * firmware writes the expected values into the page and leaves it
+ * unencrypted so it can be used for debugging and error-reporting.
+ *
+ * Copy this page back into the source buffer so userspace can use this
+ * information to provide information on which CPUID leaves/fields
+ * failed CPUID validation.
+ */
+ if (!snp_page_reclaim(kvm, pfn + i) &&
+ sev_populate_args->type == KVM_SEV_SNP_PAGE_TYPE_CPUID &&
+ sev_populate_args->fw_error == SEV_RET_INVALID_PARAM) {
+ void *vaddr = kmap_local_pfn(pfn + i);
+
+ if (copy_to_user(src + i * PAGE_SIZE, vaddr, PAGE_SIZE))
+ pr_debug("Failed to write CPUID page back to userspace\n");
+
+ kunmap_local(vaddr);
+ }
+
+ /* pfn + i is hypervisor-owned now, so skip below cleanup for it. */
+ n_private--;
+
+err:
+ pr_debug("%s: exiting with error ret %d (fw_error %d), restoring %d gmem PFNs to shared.\n",
+ __func__, ret, sev_populate_args->fw_error, n_private);
+ for (i = 0; i < n_private; i++)
+ kvm_rmp_make_shared(kvm, pfn + i, PG_LEVEL_4K);
+
+ return ret;
+}
+
+static int snp_launch_update(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_gmem_populate_args sev_populate_args = {0};
+ struct kvm_sev_snp_launch_update params;
+ struct kvm_memory_slot *memslot;
+ long npages, count;
+ void __user *src;
+ int ret = 0;
+
+ if (!sev_snp_guest(kvm) || !sev->snp_context)
+ return -EINVAL;
+
+ if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
+ return -EFAULT;
+
+ pr_debug("%s: GFN start 0x%llx length 0x%llx type %d flags %d\n", __func__,
+ params.gfn_start, params.len, params.type, params.flags);
+
+ if (!PAGE_ALIGNED(params.len) || params.flags ||
+ (params.type != KVM_SEV_SNP_PAGE_TYPE_NORMAL &&
+ params.type != KVM_SEV_SNP_PAGE_TYPE_ZERO &&
+ params.type != KVM_SEV_SNP_PAGE_TYPE_UNMEASURED &&
+ params.type != KVM_SEV_SNP_PAGE_TYPE_SECRETS &&
+ params.type != KVM_SEV_SNP_PAGE_TYPE_CPUID))
+ return -EINVAL;
+
+ npages = params.len / PAGE_SIZE;
+
+ /*
+ * For each GFN that's being prepared as part of the initial guest
+ * state, the following pre-conditions are verified:
+ *
+ * 1) The backing memslot is a valid private memslot.
+ * 2) The GFN has been set to private via KVM_SET_MEMORY_ATTRIBUTES
+ * beforehand.
+ * 3) The PFN of the guest_memfd has not already been set to private
+ * in the RMP table.
+ *
+ * The KVM MMU relies on kvm->mmu_invalidate_seq to retry nested page
+ * faults if there's a race between a fault and an attribute update via
+ * KVM_SET_MEMORY_ATTRIBUTES, and a similar approach could be utilized
+ * here. However, kvm->slots_lock guards against both this as well as
+ * concurrent memslot updates occurring while these checks are being
+ * performed, so use that here to make it easier to reason about the
+ * initial expected state and better guard against unexpected
+ * situations.
+ */
+ mutex_lock(&kvm->slots_lock);
+
+ memslot = gfn_to_memslot(kvm, params.gfn_start);
+ if (!kvm_slot_can_be_private(memslot)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ sev_populate_args.sev_fd = argp->sev_fd;
+ sev_populate_args.type = params.type;
+ src = params.type == KVM_SEV_SNP_PAGE_TYPE_ZERO ? NULL : u64_to_user_ptr(params.uaddr);
+
+ count = kvm_gmem_populate(kvm, params.gfn_start, src, npages,
+ sev_gmem_post_populate, &sev_populate_args);
+ if (count < 0) {
+ argp->error = sev_populate_args.fw_error;
+ pr_debug("%s: kvm_gmem_populate failed, ret %ld (fw_error %d)\n",
+ __func__, count, argp->error);
+ ret = -EIO;
+ } else {
+ params.gfn_start += count;
+ params.len -= count * PAGE_SIZE;
+ if (params.type != KVM_SEV_SNP_PAGE_TYPE_ZERO)
+ params.uaddr += count * PAGE_SIZE;
+
+ ret = 0;
+ if (copy_to_user(u64_to_user_ptr(argp->data), &params, sizeof(params)))
+ ret = -EFAULT;
+ }
+
+out:
+ mutex_unlock(&kvm->slots_lock);
+
+ return ret;
+}
+
+static int snp_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_snp_launch_update data = {};
+ struct kvm_vcpu *vcpu;
+ unsigned long i;
+ int ret;
+
+ data.gctx_paddr = __psp_pa(sev->snp_context);
+ data.page_type = SNP_PAGE_TYPE_VMSA;
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u64 pfn = __pa(svm->sev_es.vmsa) >> PAGE_SHIFT;
+
+ ret = sev_es_sync_vmsa(svm);
+ if (ret)
+ return ret;
+
+ /* Transition the VMSA page to a firmware state. */
+ ret = rmp_make_private(pfn, INITIAL_VMSA_GPA, PG_LEVEL_4K, sev->asid, true);
+ if (ret)
+ return ret;
+
+ /* Issue the SNP command to encrypt the VMSA */
+ data.address = __sme_pa(svm->sev_es.vmsa);
+ ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_LAUNCH_UPDATE,
+ &data, &argp->error);
+ if (ret) {
+ snp_page_reclaim(kvm, pfn);
+
+ return ret;
+ }
+
+ svm->vcpu.arch.guest_state_protected = true;
+ /*
+ * SEV-ES (and thus SNP) guest mandates LBR Virtualization to
+ * be _always_ ON. Enable it only after setting
+ * guest_state_protected because KVM_SET_MSRS allows dynamic
+ * toggling of LBRV (for performance reason) on write access to
+ * MSR_IA32_DEBUGCTLMSR when guest_state_protected is not set.
+ */
+ svm_enable_lbrv(vcpu);
+ }
+
+ return 0;
+}
+
+static int snp_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct kvm_sev_snp_launch_finish params;
+ struct sev_data_snp_launch_finish *data;
+ void *id_block = NULL, *id_auth = NULL;
+ int ret;
+
+ if (!sev_snp_guest(kvm))
+ return -ENOTTY;
+
+ if (!sev->snp_context)
+ return -EINVAL;
+
+ if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
+ return -EFAULT;
+
+ if (params.flags)
+ return -EINVAL;
+
+ /* Measure all vCPUs using LAUNCH_UPDATE before finalizing the launch flow. */
+ ret = snp_launch_update_vmsa(kvm, argp);
+ if (ret)
+ return ret;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
+ if (!data)
+ return -ENOMEM;
+
+ if (params.id_block_en) {
+ id_block = psp_copy_user_blob(params.id_block_uaddr, KVM_SEV_SNP_ID_BLOCK_SIZE);
+ if (IS_ERR(id_block)) {
+ ret = PTR_ERR(id_block);
+ goto e_free;
+ }
+
+ data->id_block_en = 1;
+ data->id_block_paddr = __sme_pa(id_block);
+
+ id_auth = psp_copy_user_blob(params.id_auth_uaddr, KVM_SEV_SNP_ID_AUTH_SIZE);
+ if (IS_ERR(id_auth)) {
+ ret = PTR_ERR(id_auth);
+ goto e_free_id_block;
+ }
+
+ data->id_auth_paddr = __sme_pa(id_auth);
+
+ if (params.auth_key_en)
+ data->auth_key_en = 1;
+ }
+
+ data->vcek_disabled = params.vcek_disabled;
+
+ memcpy(data->host_data, params.host_data, KVM_SEV_SNP_FINISH_DATA_SIZE);
+ data->gctx_paddr = __psp_pa(sev->snp_context);
+ ret = sev_issue_cmd(kvm, SEV_CMD_SNP_LAUNCH_FINISH, data, &argp->error);
+
+ /*
+ * Now that there will be no more SNP_LAUNCH_UPDATE ioctls, private pages
+ * can be given to the guest simply by marking the RMP entry as private.
+ * This can happen on first access and also with KVM_PRE_FAULT_MEMORY.
+ */
+ if (!ret)
+ kvm->arch.pre_fault_allowed = true;
+
+ kfree(id_auth);
+
+e_free_id_block:
+ kfree(id_block);
+
+e_free:
+ kfree(data);
+
+ return ret;
+}
+
int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp)
{
struct kvm_sev_cmd sev_cmd;
@@ -1884,6 +2585,15 @@ int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp)
goto out;
}
+ /*
+ * Once KVM_SEV_INIT2 initializes a KVM instance as an SNP guest, only
+ * allow the use of SNP-specific commands.
+ */
+ if (sev_snp_guest(kvm) && sev_cmd.id < KVM_SEV_SNP_LAUNCH_START) {
+ r = -EPERM;
+ goto out;
+ }
+
switch (sev_cmd.id) {
case KVM_SEV_ES_INIT:
if (!sev_es_enabled) {
@@ -1894,6 +2604,9 @@ int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp)
case KVM_SEV_INIT:
r = sev_guest_init(kvm, &sev_cmd);
break;
+ case KVM_SEV_INIT2:
+ r = sev_guest_init2(kvm, &sev_cmd);
+ break;
case KVM_SEV_LAUNCH_START:
r = sev_launch_start(kvm, &sev_cmd);
break;
@@ -1945,6 +2658,15 @@ int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp)
case KVM_SEV_RECEIVE_FINISH:
r = sev_receive_finish(kvm, &sev_cmd);
break;
+ case KVM_SEV_SNP_LAUNCH_START:
+ r = snp_launch_start(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_SNP_LAUNCH_UPDATE:
+ r = snp_launch_update(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_SNP_LAUNCH_FINISH:
+ r = snp_launch_finish(kvm, &sev_cmd);
+ break;
default:
r = -EINVAL;
goto out;
@@ -2081,15 +2803,15 @@ int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd)
struct kvm_sev_info *source_sev, *mirror_sev;
int ret;
- if (!f.file)
+ if (!fd_file(f))
return -EBADF;
- if (!file_is_kvm(f.file)) {
+ if (!file_is_kvm(fd_file(f))) {
ret = -EBADF;
goto e_source_fput;
}
- source_kvm = f.file->private_data;
+ source_kvm = fd_file(f)->private_data;
ret = sev_lock_two_vms(kvm, source_kvm);
if (ret)
goto e_source_fput;
@@ -2121,6 +2843,7 @@ int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd)
mirror_sev->asid = source_sev->asid;
mirror_sev->fd = source_sev->fd;
mirror_sev->es_active = source_sev->es_active;
+ mirror_sev->need_init = false;
mirror_sev->handle = source_sev->handle;
INIT_LIST_HEAD(&mirror_sev->regions_list);
INIT_LIST_HEAD(&mirror_sev->mirror_vms);
@@ -2139,6 +2862,31 @@ e_source_fput:
return ret;
}
+static int snp_decommission_context(struct kvm *kvm)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_snp_addr data = {};
+ int ret;
+
+ /* If context is not created then do nothing */
+ if (!sev->snp_context)
+ return 0;
+
+ /* Do the decommision, which will unbind the ASID from the SNP context */
+ data.address = __sme_pa(sev->snp_context);
+ down_write(&sev_deactivate_lock);
+ ret = sev_do_cmd(SEV_CMD_SNP_DECOMMISSION, &data, NULL);
+ up_write(&sev_deactivate_lock);
+
+ if (WARN_ONCE(ret, "Failed to release guest context, ret %d", ret))
+ return ret;
+
+ snp_free_firmware_page(sev->snp_context);
+ sev->snp_context = NULL;
+
+ return 0;
+}
+
void sev_vm_destroy(struct kvm *kvm)
{
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
@@ -2180,22 +2928,42 @@ void sev_vm_destroy(struct kvm *kvm)
}
}
- sev_unbind_asid(kvm, sev->handle);
+ if (sev_snp_guest(kvm)) {
+ snp_guest_req_cleanup(kvm);
+
+ /*
+ * Decomission handles unbinding of the ASID. If it fails for
+ * some unexpected reason, just leak the ASID.
+ */
+ if (snp_decommission_context(kvm))
+ return;
+ } else {
+ sev_unbind_asid(kvm, sev->handle);
+ }
+
sev_asid_free(sev);
}
void __init sev_set_cpu_caps(void)
{
- if (!sev_enabled)
- kvm_cpu_cap_clear(X86_FEATURE_SEV);
- if (!sev_es_enabled)
- kvm_cpu_cap_clear(X86_FEATURE_SEV_ES);
+ if (sev_enabled) {
+ kvm_cpu_cap_set(X86_FEATURE_SEV);
+ kvm_caps.supported_vm_types |= BIT(KVM_X86_SEV_VM);
+ }
+ if (sev_es_enabled) {
+ kvm_cpu_cap_set(X86_FEATURE_SEV_ES);
+ kvm_caps.supported_vm_types |= BIT(KVM_X86_SEV_ES_VM);
+ }
+ if (sev_snp_enabled) {
+ kvm_cpu_cap_set(X86_FEATURE_SEV_SNP);
+ kvm_caps.supported_vm_types |= BIT(KVM_X86_SNP_VM);
+ }
}
void __init sev_hardware_setup(void)
{
-#ifdef CONFIG_KVM_AMD_SEV
unsigned int eax, ebx, ecx, edx, sev_asid_count, sev_es_asid_count;
+ bool sev_snp_supported = false;
bool sev_es_supported = false;
bool sev_supported = false;
@@ -2269,6 +3037,12 @@ void __init sev_hardware_setup(void)
if (!boot_cpu_has(X86_FEATURE_SEV_ES))
goto out;
+ if (!lbrv) {
+ WARN_ONCE(!boot_cpu_has(X86_FEATURE_LBRV),
+ "LBRV must be present for SEV-ES support");
+ goto out;
+ }
+
/* Has the system been allocated ASIDs for SEV-ES? */
if (min_sev_asid == 1)
goto out;
@@ -2276,6 +3050,7 @@ void __init sev_hardware_setup(void)
sev_es_asid_count = min_sev_asid - 1;
WARN_ON_ONCE(misc_cg_set_capacity(MISC_CG_RES_SEV_ES, sev_es_asid_count));
sev_es_supported = true;
+ sev_snp_supported = sev_snp_enabled && cc_platform_has(CC_ATTR_HOST_SEV_SNP);
out:
if (boot_cpu_has(X86_FEATURE_SEV))
@@ -2288,13 +3063,22 @@ out:
pr_info("SEV-ES %s (ASIDs %u - %u)\n",
sev_es_supported ? "enabled" : "disabled",
min_sev_asid > 1 ? 1 : 0, min_sev_asid - 1);
+ if (boot_cpu_has(X86_FEATURE_SEV_SNP))
+ pr_info("SEV-SNP %s (ASIDs %u - %u)\n",
+ sev_snp_supported ? "enabled" : "disabled",
+ min_sev_asid > 1 ? 1 : 0, min_sev_asid - 1);
sev_enabled = sev_supported;
sev_es_enabled = sev_es_supported;
+ sev_snp_enabled = sev_snp_supported;
+
if (!sev_es_enabled || !cpu_feature_enabled(X86_FEATURE_DEBUG_SWAP) ||
!cpu_feature_enabled(X86_FEATURE_NO_NESTED_DATA_BP))
sev_es_debug_swap_enabled = false;
-#endif
+
+ sev_supported_vmsa_features = 0;
+ if (sev_es_debug_swap_enabled)
+ sev_supported_vmsa_features |= SVM_SEV_FEAT_DEBUG_SWAP;
}
void sev_hardware_unsetup(void)
@@ -2366,7 +3150,13 @@ do_wbinvd:
void sev_guest_memory_reclaimed(struct kvm *kvm)
{
- if (!sev_guest(kvm))
+ /*
+ * With SNP+gmem, private/encrypted memory is unreachable via the
+ * hva-based mmu notifiers, so these events are only actually
+ * pertaining to shared pages where there is no need to perform
+ * the WBINVD to flush associated caches.
+ */
+ if (!sev_guest(kvm) || sev_snp_guest(kvm))
return;
wbinvd_on_all_cpus();
@@ -2381,11 +3171,24 @@ void sev_free_vcpu(struct kvm_vcpu *vcpu)
svm = to_svm(vcpu);
+ /*
+ * If it's an SNP guest, then the VMSA was marked in the RMP table as
+ * a guest-owned page. Transition the page to hypervisor state before
+ * releasing it back to the system.
+ */
+ if (sev_snp_guest(vcpu->kvm)) {
+ u64 pfn = __pa(svm->sev_es.vmsa) >> PAGE_SHIFT;
+
+ if (kvm_rmp_make_shared(vcpu->kvm, pfn, PG_LEVEL_4K))
+ goto skip_vmsa_free;
+ }
+
if (vcpu->arch.guest_state_protected)
sev_flush_encrypted_page(vcpu, svm->sev_es.vmsa);
__free_page(virt_to_page(svm->sev_es.vmsa));
+skip_vmsa_free:
if (svm->sev_es.ghcb_sa_free)
kvfree(svm->sev_es.ghcb_sa);
}
@@ -2581,10 +3384,31 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm)
if (!kvm_ghcb_sw_scratch_is_valid(svm))
goto vmgexit_err;
break;
+ case SVM_VMGEXIT_AP_CREATION:
+ if (!sev_snp_guest(vcpu->kvm))
+ goto vmgexit_err;
+ if (lower_32_bits(control->exit_info_1) != SVM_VMGEXIT_AP_DESTROY)
+ if (!kvm_ghcb_rax_is_valid(svm))
+ goto vmgexit_err;
+ break;
case SVM_VMGEXIT_NMI_COMPLETE:
case SVM_VMGEXIT_AP_HLT_LOOP:
case SVM_VMGEXIT_AP_JUMP_TABLE:
case SVM_VMGEXIT_UNSUPPORTED_EVENT:
+ case SVM_VMGEXIT_HV_FEATURES:
+ case SVM_VMGEXIT_TERM_REQUEST:
+ break;
+ case SVM_VMGEXIT_PSC:
+ if (!sev_snp_guest(vcpu->kvm) || !kvm_ghcb_sw_scratch_is_valid(svm))
+ goto vmgexit_err;
+ break;
+ case SVM_VMGEXIT_GUEST_REQUEST:
+ case SVM_VMGEXIT_EXT_GUEST_REQUEST:
+ if (!sev_snp_guest(vcpu->kvm) ||
+ !PAGE_ALIGNED(control->exit_info_1) ||
+ !PAGE_ALIGNED(control->exit_info_2) ||
+ control->exit_info_1 == control->exit_info_2)
+ goto vmgexit_err;
break;
default:
reason = GHCB_ERR_INVALID_EVENT;
@@ -2615,6 +3439,9 @@ vmgexit_err:
void sev_es_unmap_ghcb(struct vcpu_svm *svm)
{
+ /* Clear any indication that the vCPU is in a type of AP Reset Hold */
+ svm->sev_es.ap_reset_hold_type = AP_RESET_HOLD_NONE;
+
if (!svm->sev_es.ghcb)
return;
@@ -2770,10 +3597,539 @@ static void set_ghcb_msr(struct vcpu_svm *svm, u64 value)
svm->vmcb->control.ghcb_gpa = value;
}
+static int snp_rmptable_psmash(kvm_pfn_t pfn)
+{
+ int ret;
+
+ pfn = pfn & ~(KVM_PAGES_PER_HPAGE(PG_LEVEL_2M) - 1);
+
+ /*
+ * PSMASH_FAIL_INUSE indicates another processor is modifying the
+ * entry, so retry until that's no longer the case.
+ */
+ do {
+ ret = psmash(pfn);
+ } while (ret == PSMASH_FAIL_INUSE);
+
+ return ret;
+}
+
+static int snp_complete_psc_msr(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (vcpu->run->hypercall.ret)
+ set_ghcb_msr(svm, GHCB_MSR_PSC_RESP_ERROR);
+ else
+ set_ghcb_msr(svm, GHCB_MSR_PSC_RESP);
+
+ return 1; /* resume guest */
+}
+
+static int snp_begin_psc_msr(struct vcpu_svm *svm, u64 ghcb_msr)
+{
+ u64 gpa = gfn_to_gpa(GHCB_MSR_PSC_REQ_TO_GFN(ghcb_msr));
+ u8 op = GHCB_MSR_PSC_REQ_TO_OP(ghcb_msr);
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+
+ if (op != SNP_PAGE_STATE_PRIVATE && op != SNP_PAGE_STATE_SHARED) {
+ set_ghcb_msr(svm, GHCB_MSR_PSC_RESP_ERROR);
+ return 1; /* resume guest */
+ }
+
+ if (!(vcpu->kvm->arch.hypercall_exit_enabled & (1 << KVM_HC_MAP_GPA_RANGE))) {
+ set_ghcb_msr(svm, GHCB_MSR_PSC_RESP_ERROR);
+ return 1; /* resume guest */
+ }
+
+ vcpu->run->exit_reason = KVM_EXIT_HYPERCALL;
+ vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE;
+ vcpu->run->hypercall.args[0] = gpa;
+ vcpu->run->hypercall.args[1] = 1;
+ vcpu->run->hypercall.args[2] = (op == SNP_PAGE_STATE_PRIVATE)
+ ? KVM_MAP_GPA_RANGE_ENCRYPTED
+ : KVM_MAP_GPA_RANGE_DECRYPTED;
+ vcpu->run->hypercall.args[2] |= KVM_MAP_GPA_RANGE_PAGE_SZ_4K;
+
+ vcpu->arch.complete_userspace_io = snp_complete_psc_msr;
+
+ return 0; /* forward request to userspace */
+}
+
+struct psc_buffer {
+ struct psc_hdr hdr;
+ struct psc_entry entries[];
+} __packed;
+
+static int snp_begin_psc(struct vcpu_svm *svm, struct psc_buffer *psc);
+
+static void snp_complete_psc(struct vcpu_svm *svm, u64 psc_ret)
+{
+ svm->sev_es.psc_inflight = 0;
+ svm->sev_es.psc_idx = 0;
+ svm->sev_es.psc_2m = false;
+ ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, psc_ret);
+}
+
+static void __snp_complete_one_psc(struct vcpu_svm *svm)
+{
+ struct psc_buffer *psc = svm->sev_es.ghcb_sa;
+ struct psc_entry *entries = psc->entries;
+ struct psc_hdr *hdr = &psc->hdr;
+ __u16 idx;
+
+ /*
+ * Everything in-flight has been processed successfully. Update the
+ * corresponding entries in the guest's PSC buffer and zero out the
+ * count of in-flight PSC entries.
+ */
+ for (idx = svm->sev_es.psc_idx; svm->sev_es.psc_inflight;
+ svm->sev_es.psc_inflight--, idx++) {
+ struct psc_entry *entry = &entries[idx];
+
+ entry->cur_page = entry->pagesize ? 512 : 1;
+ }
+
+ hdr->cur_entry = idx;
+}
+
+static int snp_complete_one_psc(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct psc_buffer *psc = svm->sev_es.ghcb_sa;
+
+ if (vcpu->run->hypercall.ret) {
+ snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC);
+ return 1; /* resume guest */
+ }
+
+ __snp_complete_one_psc(svm);
+
+ /* Handle the next range (if any). */
+ return snp_begin_psc(svm, psc);
+}
+
+static int snp_begin_psc(struct vcpu_svm *svm, struct psc_buffer *psc)
+{
+ struct psc_entry *entries = psc->entries;
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct psc_hdr *hdr = &psc->hdr;
+ struct psc_entry entry_start;
+ u16 idx, idx_start, idx_end;
+ int npages;
+ bool huge;
+ u64 gfn;
+
+ if (!(vcpu->kvm->arch.hypercall_exit_enabled & (1 << KVM_HC_MAP_GPA_RANGE))) {
+ snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC);
+ return 1;
+ }
+
+next_range:
+ /* There should be no other PSCs in-flight at this point. */
+ if (WARN_ON_ONCE(svm->sev_es.psc_inflight)) {
+ snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC);
+ return 1;
+ }
+
+ /*
+ * The PSC descriptor buffer can be modified by a misbehaved guest after
+ * validation, so take care to only use validated copies of values used
+ * for things like array indexing.
+ */
+ idx_start = hdr->cur_entry;
+ idx_end = hdr->end_entry;
+
+ if (idx_end >= VMGEXIT_PSC_MAX_COUNT) {
+ snp_complete_psc(svm, VMGEXIT_PSC_ERROR_INVALID_HDR);
+ return 1;
+ }
+
+ /* Find the start of the next range which needs processing. */
+ for (idx = idx_start; idx <= idx_end; idx++, hdr->cur_entry++) {
+ entry_start = entries[idx];
+
+ gfn = entry_start.gfn;
+ huge = entry_start.pagesize;
+ npages = huge ? 512 : 1;
+
+ if (entry_start.cur_page > npages || !IS_ALIGNED(gfn, npages)) {
+ snp_complete_psc(svm, VMGEXIT_PSC_ERROR_INVALID_ENTRY);
+ return 1;
+ }
+
+ if (entry_start.cur_page) {
+ /*
+ * If this is a partially-completed 2M range, force 4K handling
+ * for the remaining pages since they're effectively split at
+ * this point. Subsequent code should ensure this doesn't get
+ * combined with adjacent PSC entries where 2M handling is still
+ * possible.
+ */
+ npages -= entry_start.cur_page;
+ gfn += entry_start.cur_page;
+ huge = false;
+ }
+
+ if (npages)
+ break;
+ }
+
+ if (idx > idx_end) {
+ /* Nothing more to process. */
+ snp_complete_psc(svm, 0);
+ return 1;
+ }
+
+ svm->sev_es.psc_2m = huge;
+ svm->sev_es.psc_idx = idx;
+ svm->sev_es.psc_inflight = 1;
+
+ /*
+ * Find all subsequent PSC entries that contain adjacent GPA
+ * ranges/operations and can be combined into a single
+ * KVM_HC_MAP_GPA_RANGE exit.
+ */
+ while (++idx <= idx_end) {
+ struct psc_entry entry = entries[idx];
+
+ if (entry.operation != entry_start.operation ||
+ entry.gfn != entry_start.gfn + npages ||
+ entry.cur_page || !!entry.pagesize != huge)
+ break;
+
+ svm->sev_es.psc_inflight++;
+ npages += huge ? 512 : 1;
+ }
+
+ switch (entry_start.operation) {
+ case VMGEXIT_PSC_OP_PRIVATE:
+ case VMGEXIT_PSC_OP_SHARED:
+ vcpu->run->exit_reason = KVM_EXIT_HYPERCALL;
+ vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE;
+ vcpu->run->hypercall.args[0] = gfn_to_gpa(gfn);
+ vcpu->run->hypercall.args[1] = npages;
+ vcpu->run->hypercall.args[2] = entry_start.operation == VMGEXIT_PSC_OP_PRIVATE
+ ? KVM_MAP_GPA_RANGE_ENCRYPTED
+ : KVM_MAP_GPA_RANGE_DECRYPTED;
+ vcpu->run->hypercall.args[2] |= entry_start.pagesize
+ ? KVM_MAP_GPA_RANGE_PAGE_SZ_2M
+ : KVM_MAP_GPA_RANGE_PAGE_SZ_4K;
+ vcpu->arch.complete_userspace_io = snp_complete_one_psc;
+ return 0; /* forward request to userspace */
+ default:
+ /*
+ * Only shared/private PSC operations are currently supported, so if the
+ * entire range consists of unsupported operations (e.g. SMASH/UNSMASH),
+ * then consider the entire range completed and avoid exiting to
+ * userspace. In theory snp_complete_psc() can always be called directly
+ * at this point to complete the current range and start the next one,
+ * but that could lead to unexpected levels of recursion.
+ */
+ __snp_complete_one_psc(svm);
+ goto next_range;
+ }
+
+ unreachable();
+}
+
+static int __sev_snp_update_protected_guest_state(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ WARN_ON(!mutex_is_locked(&svm->sev_es.snp_vmsa_mutex));
+
+ /* Mark the vCPU as offline and not runnable */
+ vcpu->arch.pv.pv_unhalted = false;
+ vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
+
+ /* Clear use of the VMSA */
+ svm->vmcb->control.vmsa_pa = INVALID_PAGE;
+
+ if (VALID_PAGE(svm->sev_es.snp_vmsa_gpa)) {
+ gfn_t gfn = gpa_to_gfn(svm->sev_es.snp_vmsa_gpa);
+ struct kvm_memory_slot *slot;
+ kvm_pfn_t pfn;
+
+ slot = gfn_to_memslot(vcpu->kvm, gfn);
+ if (!slot)
+ return -EINVAL;
+
+ /*
+ * The new VMSA will be private memory guest memory, so
+ * retrieve the PFN from the gmem backend.
+ */
+ if (kvm_gmem_get_pfn(vcpu->kvm, slot, gfn, &pfn, NULL))
+ return -EINVAL;
+
+ /*
+ * From this point forward, the VMSA will always be a
+ * guest-mapped page rather than the initial one allocated
+ * by KVM in svm->sev_es.vmsa. In theory, svm->sev_es.vmsa
+ * could be free'd and cleaned up here, but that involves
+ * cleanups like wbinvd_on_all_cpus() which would ideally
+ * be handled during teardown rather than guest boot.
+ * Deferring that also allows the existing logic for SEV-ES
+ * VMSAs to be re-used with minimal SNP-specific changes.
+ */
+ svm->sev_es.snp_has_guest_vmsa = true;
+
+ /* Use the new VMSA */
+ svm->vmcb->control.vmsa_pa = pfn_to_hpa(pfn);
+
+ /* Mark the vCPU as runnable */
+ vcpu->arch.pv.pv_unhalted = false;
+ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+
+ svm->sev_es.snp_vmsa_gpa = INVALID_PAGE;
+
+ /*
+ * gmem pages aren't currently migratable, but if this ever
+ * changes then care should be taken to ensure
+ * svm->sev_es.vmsa is pinned through some other means.
+ */
+ kvm_release_pfn_clean(pfn);
+ }
+
+ /*
+ * When replacing the VMSA during SEV-SNP AP creation,
+ * mark the VMCB dirty so that full state is always reloaded.
+ */
+ vmcb_mark_all_dirty(svm->vmcb);
+
+ return 0;
+}
+
+/*
+ * Invoked as part of svm_vcpu_reset() processing of an init event.
+ */
+void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ int ret;
+
+ if (!sev_snp_guest(vcpu->kvm))
+ return;
+
+ mutex_lock(&svm->sev_es.snp_vmsa_mutex);
+
+ if (!svm->sev_es.snp_ap_waiting_for_reset)
+ goto unlock;
+
+ svm->sev_es.snp_ap_waiting_for_reset = false;
+
+ ret = __sev_snp_update_protected_guest_state(vcpu);
+ if (ret)
+ vcpu_unimpl(vcpu, "snp: AP state update on init failed\n");
+
+unlock:
+ mutex_unlock(&svm->sev_es.snp_vmsa_mutex);
+}
+
+static int sev_snp_ap_creation(struct vcpu_svm *svm)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(svm->vcpu.kvm)->sev_info;
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct kvm_vcpu *target_vcpu;
+ struct vcpu_svm *target_svm;
+ unsigned int request;
+ unsigned int apic_id;
+ bool kick;
+ int ret;
+
+ request = lower_32_bits(svm->vmcb->control.exit_info_1);
+ apic_id = upper_32_bits(svm->vmcb->control.exit_info_1);
+
+ /* Validate the APIC ID */
+ target_vcpu = kvm_get_vcpu_by_id(vcpu->kvm, apic_id);
+ if (!target_vcpu) {
+ vcpu_unimpl(vcpu, "vmgexit: invalid AP APIC ID [%#x] from guest\n",
+ apic_id);
+ return -EINVAL;
+ }
+
+ ret = 0;
+
+ target_svm = to_svm(target_vcpu);
+
+ /*
+ * The target vCPU is valid, so the vCPU will be kicked unless the
+ * request is for CREATE_ON_INIT. For any errors at this stage, the
+ * kick will place the vCPU in an non-runnable state.
+ */
+ kick = true;
+
+ mutex_lock(&target_svm->sev_es.snp_vmsa_mutex);
+
+ target_svm->sev_es.snp_vmsa_gpa = INVALID_PAGE;
+ target_svm->sev_es.snp_ap_waiting_for_reset = true;
+
+ /* Interrupt injection mode shouldn't change for AP creation */
+ if (request < SVM_VMGEXIT_AP_DESTROY) {
+ u64 sev_features;
+
+ sev_features = vcpu->arch.regs[VCPU_REGS_RAX];
+ sev_features ^= sev->vmsa_features;
+
+ if (sev_features & SVM_SEV_FEAT_INT_INJ_MODES) {
+ vcpu_unimpl(vcpu, "vmgexit: invalid AP injection mode [%#lx] from guest\n",
+ vcpu->arch.regs[VCPU_REGS_RAX]);
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+
+ switch (request) {
+ case SVM_VMGEXIT_AP_CREATE_ON_INIT:
+ kick = false;
+ fallthrough;
+ case SVM_VMGEXIT_AP_CREATE:
+ if (!page_address_valid(vcpu, svm->vmcb->control.exit_info_2)) {
+ vcpu_unimpl(vcpu, "vmgexit: invalid AP VMSA address [%#llx] from guest\n",
+ svm->vmcb->control.exit_info_2);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /*
+ * Malicious guest can RMPADJUST a large page into VMSA which
+ * will hit the SNP erratum where the CPU will incorrectly signal
+ * an RMP violation #PF if a hugepage collides with the RMP entry
+ * of VMSA page, reject the AP CREATE request if VMSA address from
+ * guest is 2M aligned.
+ */
+ if (IS_ALIGNED(svm->vmcb->control.exit_info_2, PMD_SIZE)) {
+ vcpu_unimpl(vcpu,
+ "vmgexit: AP VMSA address [%llx] from guest is unsafe as it is 2M aligned\n",
+ svm->vmcb->control.exit_info_2);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ target_svm->sev_es.snp_vmsa_gpa = svm->vmcb->control.exit_info_2;
+ break;
+ case SVM_VMGEXIT_AP_DESTROY:
+ break;
+ default:
+ vcpu_unimpl(vcpu, "vmgexit: invalid AP creation request [%#x] from guest\n",
+ request);
+ ret = -EINVAL;
+ break;
+ }
+
+out:
+ if (kick) {
+ kvm_make_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, target_vcpu);
+ kvm_vcpu_kick(target_vcpu);
+ }
+
+ mutex_unlock(&target_svm->sev_es.snp_vmsa_mutex);
+
+ return ret;
+}
+
+static int snp_handle_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t resp_gpa)
+{
+ struct sev_data_snp_guest_request data = {0};
+ struct kvm *kvm = svm->vcpu.kvm;
+ struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
+ sev_ret_code fw_err = 0;
+ int ret;
+
+ if (!sev_snp_guest(kvm))
+ return -EINVAL;
+
+ mutex_lock(&sev->guest_req_mutex);
+
+ if (kvm_read_guest(kvm, req_gpa, sev->guest_req_buf, PAGE_SIZE)) {
+ ret = -EIO;
+ goto out_unlock;
+ }
+
+ data.gctx_paddr = __psp_pa(sev->snp_context);
+ data.req_paddr = __psp_pa(sev->guest_req_buf);
+ data.res_paddr = __psp_pa(sev->guest_resp_buf);
+
+ /*
+ * Firmware failures are propagated on to guest, but any other failure
+ * condition along the way should be reported to userspace. E.g. if
+ * the PSP is dead and commands are timing out.
+ */
+ ret = sev_issue_cmd(kvm, SEV_CMD_SNP_GUEST_REQUEST, &data, &fw_err);
+ if (ret && !fw_err)
+ goto out_unlock;
+
+ if (kvm_write_guest(kvm, resp_gpa, sev->guest_resp_buf, PAGE_SIZE)) {
+ ret = -EIO;
+ goto out_unlock;
+ }
+
+ ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, SNP_GUEST_ERR(0, fw_err));
+
+ ret = 1; /* resume guest */
+
+out_unlock:
+ mutex_unlock(&sev->guest_req_mutex);
+ return ret;
+}
+
+static int snp_handle_ext_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t resp_gpa)
+{
+ struct kvm *kvm = svm->vcpu.kvm;
+ u8 msg_type;
+
+ if (!sev_snp_guest(kvm))
+ return -EINVAL;
+
+ if (kvm_read_guest(kvm, req_gpa + offsetof(struct snp_guest_msg_hdr, msg_type),
+ &msg_type, 1))
+ return -EIO;
+
+ /*
+ * As per GHCB spec, requests of type MSG_REPORT_REQ also allow for
+ * additional certificate data to be provided alongside the attestation
+ * report via the guest-provided data pages indicated by RAX/RBX. The
+ * certificate data is optional and requires additional KVM enablement
+ * to provide an interface for userspace to provide it, but KVM still
+ * needs to be able to handle extended guest requests either way. So
+ * provide a stub implementation that will always return an empty
+ * certificate table in the guest-provided data pages.
+ */
+ if (msg_type == SNP_MSG_REPORT_REQ) {
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ u64 data_npages;
+ gpa_t data_gpa;
+
+ if (!kvm_ghcb_rax_is_valid(svm) || !kvm_ghcb_rbx_is_valid(svm))
+ goto request_invalid;
+
+ data_gpa = vcpu->arch.regs[VCPU_REGS_RAX];
+ data_npages = vcpu->arch.regs[VCPU_REGS_RBX];
+
+ if (!PAGE_ALIGNED(data_gpa))
+ goto request_invalid;
+
+ /*
+ * As per GHCB spec (see "SNP Extended Guest Request"), the
+ * certificate table is terminated by 24-bytes of zeroes.
+ */
+ if (data_npages && kvm_clear_guest(kvm, data_gpa, 24))
+ return -EIO;
+ }
+
+ return snp_handle_guest_req(svm, req_gpa, resp_gpa);
+
+request_invalid:
+ ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 2);
+ ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, GHCB_ERR_INVALID_INPUT);
+ return 1; /* resume guest */
+}
+
static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
{
struct vmcb_control_area *control = &svm->vmcb->control;
struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info;
u64 ghcb_info;
int ret = 1;
@@ -2784,7 +4140,7 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
switch (ghcb_info) {
case GHCB_MSR_SEV_INFO_REQ:
- set_ghcb_msr(svm, GHCB_MSR_SEV_INFO(GHCB_VERSION_MAX,
+ set_ghcb_msr(svm, GHCB_MSR_SEV_INFO((__u64)sev->ghcb_version,
GHCB_VERSION_MIN,
sev_enc_bit));
break;
@@ -2826,6 +4182,60 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
GHCB_MSR_INFO_POS);
break;
}
+ case GHCB_MSR_AP_RESET_HOLD_REQ:
+ svm->sev_es.ap_reset_hold_type = AP_RESET_HOLD_MSR_PROTO;
+ ret = kvm_emulate_ap_reset_hold(&svm->vcpu);
+
+ /*
+ * Preset the result to a non-SIPI return and then only set
+ * the result to non-zero when delivering a SIPI.
+ */
+ set_ghcb_msr_bits(svm, 0,
+ GHCB_MSR_AP_RESET_HOLD_RESULT_MASK,
+ GHCB_MSR_AP_RESET_HOLD_RESULT_POS);
+
+ set_ghcb_msr_bits(svm, GHCB_MSR_AP_RESET_HOLD_RESP,
+ GHCB_MSR_INFO_MASK,
+ GHCB_MSR_INFO_POS);
+ break;
+ case GHCB_MSR_HV_FT_REQ:
+ set_ghcb_msr_bits(svm, GHCB_HV_FT_SUPPORTED,
+ GHCB_MSR_HV_FT_MASK, GHCB_MSR_HV_FT_POS);
+ set_ghcb_msr_bits(svm, GHCB_MSR_HV_FT_RESP,
+ GHCB_MSR_INFO_MASK, GHCB_MSR_INFO_POS);
+ break;
+ case GHCB_MSR_PREF_GPA_REQ:
+ if (!sev_snp_guest(vcpu->kvm))
+ goto out_terminate;
+
+ set_ghcb_msr_bits(svm, GHCB_MSR_PREF_GPA_NONE, GHCB_MSR_GPA_VALUE_MASK,
+ GHCB_MSR_GPA_VALUE_POS);
+ set_ghcb_msr_bits(svm, GHCB_MSR_PREF_GPA_RESP, GHCB_MSR_INFO_MASK,
+ GHCB_MSR_INFO_POS);
+ break;
+ case GHCB_MSR_REG_GPA_REQ: {
+ u64 gfn;
+
+ if (!sev_snp_guest(vcpu->kvm))
+ goto out_terminate;
+
+ gfn = get_ghcb_msr_bits(svm, GHCB_MSR_GPA_VALUE_MASK,
+ GHCB_MSR_GPA_VALUE_POS);
+
+ svm->sev_es.ghcb_registered_gpa = gfn_to_gpa(gfn);
+
+ set_ghcb_msr_bits(svm, gfn, GHCB_MSR_GPA_VALUE_MASK,
+ GHCB_MSR_GPA_VALUE_POS);
+ set_ghcb_msr_bits(svm, GHCB_MSR_REG_GPA_RESP, GHCB_MSR_INFO_MASK,
+ GHCB_MSR_INFO_POS);
+ break;
+ }
+ case GHCB_MSR_PSC_REQ:
+ if (!sev_snp_guest(vcpu->kvm))
+ goto out_terminate;
+
+ ret = snp_begin_psc_msr(svm, control->ghcb_gpa);
+ break;
case GHCB_MSR_TERM_REQ: {
u64 reason_set, reason_code;
@@ -2838,12 +4248,7 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
pr_info("SEV-ES guest requested termination: %#llx:%#llx\n",
reason_set, reason_code);
- vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
- vcpu->run->system_event.type = KVM_SYSTEM_EVENT_SEV_TERM;
- vcpu->run->system_event.ndata = 1;
- vcpu->run->system_event.data[0] = control->ghcb_gpa;
-
- return 0;
+ goto out_terminate;
}
default:
/* Error, keep GHCB MSR value as-is */
@@ -2854,6 +4259,14 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
control->ghcb_gpa, ret);
return ret;
+
+out_terminate:
+ vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
+ vcpu->run->system_event.type = KVM_SYSTEM_EVENT_SEV_TERM;
+ vcpu->run->system_event.ndata = 1;
+ vcpu->run->system_event.data[0] = control->ghcb_gpa;
+
+ return 0;
}
int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
@@ -2889,6 +4302,13 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
trace_kvm_vmgexit_enter(vcpu->vcpu_id, svm->sev_es.ghcb);
sev_es_sync_from_ghcb(svm);
+
+ /* SEV-SNP guest requires that the GHCB GPA must be registered */
+ if (sev_snp_guest(svm->vcpu.kvm) && !ghcb_gpa_is_registered(svm, ghcb_gpa)) {
+ vcpu_unimpl(&svm->vcpu, "vmgexit: GHCB GPA [%#llx] is not registered.\n", ghcb_gpa);
+ return -EINVAL;
+ }
+
ret = sev_es_validate_vmgexit(svm);
if (ret)
return ret;
@@ -2925,6 +4345,7 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
ret = 1;
break;
case SVM_VMGEXIT_AP_HLT_LOOP:
+ svm->sev_es.ap_reset_hold_type = AP_RESET_HOLD_NAE_EVENT;
ret = kvm_emulate_ap_reset_hold(vcpu);
break;
case SVM_VMGEXIT_AP_JUMP_TABLE: {
@@ -2949,6 +4370,41 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
ret = 1;
break;
}
+ case SVM_VMGEXIT_HV_FEATURES:
+ ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, GHCB_HV_FT_SUPPORTED);
+
+ ret = 1;
+ break;
+ case SVM_VMGEXIT_TERM_REQUEST:
+ pr_info("SEV-ES guest requested termination: reason %#llx info %#llx\n",
+ control->exit_info_1, control->exit_info_2);
+ vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
+ vcpu->run->system_event.type = KVM_SYSTEM_EVENT_SEV_TERM;
+ vcpu->run->system_event.ndata = 1;
+ vcpu->run->system_event.data[0] = control->ghcb_gpa;
+ break;
+ case SVM_VMGEXIT_PSC:
+ ret = setup_vmgexit_scratch(svm, true, control->exit_info_2);
+ if (ret)
+ break;
+
+ ret = snp_begin_psc(svm, svm->sev_es.ghcb_sa);
+ break;
+ case SVM_VMGEXIT_AP_CREATION:
+ ret = sev_snp_ap_creation(svm);
+ if (ret) {
+ ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 2);
+ ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, GHCB_ERR_INVALID_INPUT);
+ }
+
+ ret = 1;
+ break;
+ case SVM_VMGEXIT_GUEST_REQUEST:
+ ret = snp_handle_guest_req(svm, control->exit_info_1, control->exit_info_2);
+ break;
+ case SVM_VMGEXIT_EXT_GUEST_REQUEST:
+ ret = snp_handle_ext_guest_req(svm, control->exit_info_1, control->exit_info_2);
+ break;
case SVM_VMGEXIT_UNSUPPORTED_EVENT:
vcpu_unimpl(vcpu,
"vmgexit: unsupported event - exit_info_1=%#llx, exit_info_2=%#llx\n",
@@ -3034,7 +4490,6 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm)
struct kvm_vcpu *vcpu = &svm->vcpu;
svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ES_ENABLE;
- svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK;
/*
* An SEV-ES guest requires a VMSA area that is a separate from the
@@ -3043,7 +4498,7 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm)
* the VMSA will be NULL if this vCPU is the destination for intrahost
* migration, and will be copied later.
*/
- if (svm->sev_es.vmsa)
+ if (svm->sev_es.vmsa && !svm->sev_es.snp_has_guest_vmsa)
svm->vmcb->control.vmsa_pa = __pa(svm->sev_es.vmsa);
/* Can't intercept CR register access, HV can't modify CR registers */
@@ -3063,7 +4518,7 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm)
svm_set_intercept(svm, TRAP_CR8_WRITE);
vmcb->control.intercepts[INTERCEPT_DR] = 0;
- if (!sev_es_debug_swap_enabled) {
+ if (!sev_vcpu_has_debug_swap(svm)) {
vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ);
vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE);
recalc_intercepts(svm);
@@ -3086,10 +4541,6 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm)
/* Clear intercepts on selected MSRs */
set_msr_interception(vcpu, svm->msrpm, MSR_EFER, 1, 1);
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_CR_PAT, 1, 1);
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
}
void sev_init_vmcb(struct vcpu_svm *svm)
@@ -3109,16 +4560,21 @@ void sev_init_vmcb(struct vcpu_svm *svm)
void sev_es_vcpu_reset(struct vcpu_svm *svm)
{
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info;
+
/*
* Set the GHCB MSR value as per the GHCB specification when emulating
* vCPU RESET for an SEV-ES guest.
*/
- set_ghcb_msr(svm, GHCB_MSR_SEV_INFO(GHCB_VERSION_MAX,
+ set_ghcb_msr(svm, GHCB_MSR_SEV_INFO((__u64)sev->ghcb_version,
GHCB_VERSION_MIN,
sev_enc_bit));
+
+ mutex_init(&svm->sev_es.snp_vmsa_mutex);
}
-void sev_es_prepare_switch_to_guest(struct sev_es_save_area *hostsa)
+void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_area *hostsa)
{
/*
* All host state for SEV-ES guests is categorized into three swap types
@@ -3137,16 +4593,16 @@ void sev_es_prepare_switch_to_guest(struct sev_es_save_area *hostsa)
* isn't saved by VMRUN, that isn't already saved by VMSAVE (performed
* by common SVM code).
*/
- hostsa->xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
+ hostsa->xcr0 = kvm_host.xcr0;
hostsa->pkru = read_pkru();
- hostsa->xss = host_xss;
+ hostsa->xss = kvm_host.xss;
/*
* If DebugSwap is enabled, debug registers are loaded but NOT saved by
* the CPU (Type-B). If DebugSwap is disabled/unsupported, the CPU both
* saves and loads debug registers (Type-A).
*/
- if (sev_es_debug_swap_enabled) {
+ if (sev_vcpu_has_debug_swap(svm)) {
hostsa->dr0 = native_get_debugreg(0);
hostsa->dr1 = native_get_debugreg(1);
hostsa->dr2 = native_get_debugreg(2);
@@ -3168,24 +4624,40 @@ void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
return;
}
- /*
- * Subsequent SIPI: Return from an AP Reset Hold VMGEXIT, where
- * the guest will set the CS and RIP. Set SW_EXIT_INFO_2 to a
- * non-zero value.
- */
- if (!svm->sev_es.ghcb)
- return;
+ /* Subsequent SIPI */
+ switch (svm->sev_es.ap_reset_hold_type) {
+ case AP_RESET_HOLD_NAE_EVENT:
+ /*
+ * Return from an AP Reset Hold VMGEXIT, where the guest will
+ * set the CS and RIP. Set SW_EXIT_INFO_2 to a non-zero value.
+ */
+ ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, 1);
+ break;
+ case AP_RESET_HOLD_MSR_PROTO:
+ /*
+ * Return from an AP Reset Hold VMGEXIT, where the guest will
+ * set the CS and RIP. Set GHCB data field to a non-zero value.
+ */
+ set_ghcb_msr_bits(svm, 1,
+ GHCB_MSR_AP_RESET_HOLD_RESULT_MASK,
+ GHCB_MSR_AP_RESET_HOLD_RESULT_POS);
- ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, 1);
+ set_ghcb_msr_bits(svm, GHCB_MSR_AP_RESET_HOLD_RESP,
+ GHCB_MSR_INFO_MASK,
+ GHCB_MSR_INFO_POS);
+ break;
+ default:
+ break;
+ }
}
-struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu)
+struct page *snp_safe_alloc_page_node(int node, gfp_t gfp)
{
unsigned long pfn;
struct page *p;
if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
- return alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ return alloc_pages_node(node, gfp | __GFP_ZERO, 0);
/*
* Allocate an SNP-safe page to workaround the SNP erratum where
@@ -3196,7 +4668,7 @@ struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu)
* Allocate one extra page, choose a page which is not
* 2MB-aligned, and free the other.
*/
- p = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO, 1);
+ p = alloc_pages_node(node, gfp | __GFP_ZERO, 1);
if (!p)
return NULL;
@@ -3210,3 +4682,271 @@ struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu)
return p;
}
+
+void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code)
+{
+ struct kvm_memory_slot *slot;
+ struct kvm *kvm = vcpu->kvm;
+ int order, rmp_level, ret;
+ bool assigned;
+ kvm_pfn_t pfn;
+ gfn_t gfn;
+
+ gfn = gpa >> PAGE_SHIFT;
+
+ /*
+ * The only time RMP faults occur for shared pages is when the guest is
+ * triggering an RMP fault for an implicit page-state change from
+ * shared->private. Implicit page-state changes are forwarded to
+ * userspace via KVM_EXIT_MEMORY_FAULT events, however, so RMP faults
+ * for shared pages should not end up here.
+ */
+ if (!kvm_mem_is_private(kvm, gfn)) {
+ pr_warn_ratelimited("SEV: Unexpected RMP fault for non-private GPA 0x%llx\n",
+ gpa);
+ return;
+ }
+
+ slot = gfn_to_memslot(kvm, gfn);
+ if (!kvm_slot_can_be_private(slot)) {
+ pr_warn_ratelimited("SEV: Unexpected RMP fault, non-private slot for GPA 0x%llx\n",
+ gpa);
+ return;
+ }
+
+ ret = kvm_gmem_get_pfn(kvm, slot, gfn, &pfn, &order);
+ if (ret) {
+ pr_warn_ratelimited("SEV: Unexpected RMP fault, no backing page for private GPA 0x%llx\n",
+ gpa);
+ return;
+ }
+
+ ret = snp_lookup_rmpentry(pfn, &assigned, &rmp_level);
+ if (ret || !assigned) {
+ pr_warn_ratelimited("SEV: Unexpected RMP fault, no assigned RMP entry found for GPA 0x%llx PFN 0x%llx error %d\n",
+ gpa, pfn, ret);
+ goto out_no_trace;
+ }
+
+ /*
+ * There are 2 cases where a PSMASH may be needed to resolve an #NPF
+ * with PFERR_GUEST_RMP_BIT set:
+ *
+ * 1) RMPADJUST/PVALIDATE can trigger an #NPF with PFERR_GUEST_SIZEM
+ * bit set if the guest issues them with a smaller granularity than
+ * what is indicated by the page-size bit in the 2MB RMP entry for
+ * the PFN that backs the GPA.
+ *
+ * 2) Guest access via NPT can trigger an #NPF if the NPT mapping is
+ * smaller than what is indicated by the 2MB RMP entry for the PFN
+ * that backs the GPA.
+ *
+ * In both these cases, the corresponding 2M RMP entry needs to
+ * be PSMASH'd to 512 4K RMP entries. If the RMP entry is already
+ * split into 4K RMP entries, then this is likely a spurious case which
+ * can occur when there are concurrent accesses by the guest to a 2MB
+ * GPA range that is backed by a 2MB-aligned PFN who's RMP entry is in
+ * the process of being PMASH'd into 4K entries. These cases should
+ * resolve automatically on subsequent accesses, so just ignore them
+ * here.
+ */
+ if (rmp_level == PG_LEVEL_4K)
+ goto out;
+
+ ret = snp_rmptable_psmash(pfn);
+ if (ret) {
+ /*
+ * Look it up again. If it's 4K now then the PSMASH may have
+ * raced with another process and the issue has already resolved
+ * itself.
+ */
+ if (!snp_lookup_rmpentry(pfn, &assigned, &rmp_level) &&
+ assigned && rmp_level == PG_LEVEL_4K)
+ goto out;
+
+ pr_warn_ratelimited("SEV: Unable to split RMP entry for GPA 0x%llx PFN 0x%llx ret %d\n",
+ gpa, pfn, ret);
+ }
+
+ kvm_zap_gfn_range(kvm, gfn, gfn + PTRS_PER_PMD);
+out:
+ trace_kvm_rmp_fault(vcpu, gpa, pfn, error_code, rmp_level, ret);
+out_no_trace:
+ put_page(pfn_to_page(pfn));
+}
+
+static bool is_pfn_range_shared(kvm_pfn_t start, kvm_pfn_t end)
+{
+ kvm_pfn_t pfn = start;
+
+ while (pfn < end) {
+ int ret, rmp_level;
+ bool assigned;
+
+ ret = snp_lookup_rmpentry(pfn, &assigned, &rmp_level);
+ if (ret) {
+ pr_warn_ratelimited("SEV: Failed to retrieve RMP entry: PFN 0x%llx GFN start 0x%llx GFN end 0x%llx RMP level %d error %d\n",
+ pfn, start, end, rmp_level, ret);
+ return false;
+ }
+
+ if (assigned) {
+ pr_debug("%s: overlap detected, PFN 0x%llx start 0x%llx end 0x%llx RMP level %d\n",
+ __func__, pfn, start, end, rmp_level);
+ return false;
+ }
+
+ pfn++;
+ }
+
+ return true;
+}
+
+static u8 max_level_for_order(int order)
+{
+ if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M))
+ return PG_LEVEL_2M;
+
+ return PG_LEVEL_4K;
+}
+
+static bool is_large_rmp_possible(struct kvm *kvm, kvm_pfn_t pfn, int order)
+{
+ kvm_pfn_t pfn_aligned = ALIGN_DOWN(pfn, PTRS_PER_PMD);
+
+ /*
+ * If this is a large folio, and the entire 2M range containing the
+ * PFN is currently shared, then the entire 2M-aligned range can be
+ * set to private via a single 2M RMP entry.
+ */
+ if (max_level_for_order(order) > PG_LEVEL_4K &&
+ is_pfn_range_shared(pfn_aligned, pfn_aligned + PTRS_PER_PMD))
+ return true;
+
+ return false;
+}
+
+int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ kvm_pfn_t pfn_aligned;
+ gfn_t gfn_aligned;
+ int level, rc;
+ bool assigned;
+
+ if (!sev_snp_guest(kvm))
+ return 0;
+
+ rc = snp_lookup_rmpentry(pfn, &assigned, &level);
+ if (rc) {
+ pr_err_ratelimited("SEV: Failed to look up RMP entry: GFN %llx PFN %llx error %d\n",
+ gfn, pfn, rc);
+ return -ENOENT;
+ }
+
+ if (assigned) {
+ pr_debug("%s: already assigned: gfn %llx pfn %llx max_order %d level %d\n",
+ __func__, gfn, pfn, max_order, level);
+ return 0;
+ }
+
+ if (is_large_rmp_possible(kvm, pfn, max_order)) {
+ level = PG_LEVEL_2M;
+ pfn_aligned = ALIGN_DOWN(pfn, PTRS_PER_PMD);
+ gfn_aligned = ALIGN_DOWN(gfn, PTRS_PER_PMD);
+ } else {
+ level = PG_LEVEL_4K;
+ pfn_aligned = pfn;
+ gfn_aligned = gfn;
+ }
+
+ rc = rmp_make_private(pfn_aligned, gfn_to_gpa(gfn_aligned), level, sev->asid, false);
+ if (rc) {
+ pr_err_ratelimited("SEV: Failed to update RMP entry: GFN %llx PFN %llx level %d error %d\n",
+ gfn, pfn, level, rc);
+ return -EINVAL;
+ }
+
+ pr_debug("%s: updated: gfn %llx pfn %llx pfn_aligned %llx max_order %d level %d\n",
+ __func__, gfn, pfn, pfn_aligned, max_order, level);
+
+ return 0;
+}
+
+void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end)
+{
+ kvm_pfn_t pfn;
+
+ if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
+ return;
+
+ pr_debug("%s: PFN start 0x%llx PFN end 0x%llx\n", __func__, start, end);
+
+ for (pfn = start; pfn < end;) {
+ bool use_2m_update = false;
+ int rc, rmp_level;
+ bool assigned;
+
+ rc = snp_lookup_rmpentry(pfn, &assigned, &rmp_level);
+ if (rc || !assigned)
+ goto next_pfn;
+
+ use_2m_update = IS_ALIGNED(pfn, PTRS_PER_PMD) &&
+ end >= (pfn + PTRS_PER_PMD) &&
+ rmp_level > PG_LEVEL_4K;
+
+ /*
+ * If an unaligned PFN corresponds to a 2M region assigned as a
+ * large page in the RMP table, PSMASH the region into individual
+ * 4K RMP entries before attempting to convert a 4K sub-page.
+ */
+ if (!use_2m_update && rmp_level > PG_LEVEL_4K) {
+ /*
+ * This shouldn't fail, but if it does, report it, but
+ * still try to update RMP entry to shared and pray this
+ * was a spurious error that can be addressed later.
+ */
+ rc = snp_rmptable_psmash(pfn);
+ WARN_ONCE(rc, "SEV: Failed to PSMASH RMP entry for PFN 0x%llx error %d\n",
+ pfn, rc);
+ }
+
+ rc = rmp_make_shared(pfn, use_2m_update ? PG_LEVEL_2M : PG_LEVEL_4K);
+ if (WARN_ONCE(rc, "SEV: Failed to update RMP entry for PFN 0x%llx error %d\n",
+ pfn, rc))
+ goto next_pfn;
+
+ /*
+ * SEV-ES avoids host/guest cache coherency issues through
+ * WBINVD hooks issued via MMU notifiers during run-time, and
+ * KVM's VM destroy path at shutdown. Those MMU notifier events
+ * don't cover gmem since there is no requirement to map pages
+ * to a HVA in order to use them for a running guest. While the
+ * shutdown path would still likely cover things for SNP guests,
+ * userspace may also free gmem pages during run-time via
+ * hole-punching operations on the guest_memfd, so flush the
+ * cache entries for these pages before free'ing them back to
+ * the host.
+ */
+ clflush_cache_range(__va(pfn_to_hpa(pfn)),
+ use_2m_update ? PMD_SIZE : PAGE_SIZE);
+next_pfn:
+ pfn += use_2m_update ? PTRS_PER_PMD : 1;
+ cond_resched();
+ }
+}
+
+int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn)
+{
+ int level, rc;
+ bool assigned;
+
+ if (!sev_snp_guest(kvm))
+ return 0;
+
+ rc = snp_lookup_rmpentry(pfn, &assigned, &level);
+ if (rc || !assigned)
+ return PG_LEVEL_4K;
+
+ return level;
+}
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 9aaf83c8d57d..9df3e1e5ae81 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -53,6 +53,7 @@
#include "svm_onhyperv.h"
MODULE_AUTHOR("Qumranet");
+MODULE_DESCRIPTION("KVM support for SVM (AMD-V) extensions");
MODULE_LICENSE("GPL");
#ifdef MODULE
@@ -99,6 +100,7 @@ static const struct svm_direct_access_msrs {
{ .index = MSR_IA32_SPEC_CTRL, .always = false },
{ .index = MSR_IA32_PRED_CMD, .always = false },
{ .index = MSR_IA32_FLUSH_CMD, .always = false },
+ { .index = MSR_IA32_DEBUGCTLMSR, .always = false },
{ .index = MSR_IA32_LASTBRANCHFROMIP, .always = false },
{ .index = MSR_IA32_LASTBRANCHTOIP, .always = false },
{ .index = MSR_IA32_LASTINTFROMIP, .always = false },
@@ -215,7 +217,7 @@ int vgif = true;
module_param(vgif, int, 0444);
/* enable/disable LBR virtualization */
-static int lbrv = true;
+int lbrv = true;
module_param(lbrv, int, 0444);
static int tsc_scaling = true;
@@ -569,6 +571,11 @@ static void __svm_write_tsc_multiplier(u64 multiplier)
__this_cpu_write(current_tsc_ratio, multiplier);
}
+static __always_inline struct sev_es_save_area *sev_es_host_save_area(struct svm_cpu_data *sd)
+{
+ return &sd->save_area->host_sev_es_save;
+}
+
static inline void kvm_cpu_svm_disable(void)
{
uint64_t efer;
@@ -585,14 +592,14 @@ static inline void kvm_cpu_svm_disable(void)
}
}
-static void svm_emergency_disable(void)
+static void svm_emergency_disable_virtualization_cpu(void)
{
kvm_rebooting = true;
kvm_cpu_svm_disable();
}
-static void svm_hardware_disable(void)
+static void svm_disable_virtualization_cpu(void)
{
/* Make sure we clean up behind us */
if (tsc_scaling)
@@ -603,7 +610,7 @@ static void svm_hardware_disable(void)
amd_pmu_disable_virt();
}
-static int svm_hardware_enable(void)
+static int svm_enable_virtualization_cpu(void)
{
struct svm_cpu_data *sd;
@@ -673,12 +680,9 @@ static int svm_hardware_enable(void)
* TSC_AUX field now to avoid a RDMSR on every vCPU run.
*/
if (boot_cpu_has(X86_FEATURE_V_TSC_AUX)) {
- struct sev_es_save_area *hostsa;
u32 __maybe_unused msr_hi;
- hostsa = (struct sev_es_save_area *)(page_address(sd->save_area) + 0x400);
-
- rdmsr(MSR_TSC_AUX, hostsa->tsc_aux, msr_hi);
+ rdmsr(MSR_TSC_AUX, sev_es_host_save_area(sd)->tsc_aux, msr_hi);
}
return 0;
@@ -692,7 +696,7 @@ static void svm_cpu_uninit(int cpu)
return;
kfree(sd->sev_vmcbs);
- __free_page(sd->save_area);
+ __free_page(__sme_pa_to_page(sd->save_area_pa));
sd->save_area_pa = 0;
sd->save_area = NULL;
}
@@ -700,23 +704,24 @@ static void svm_cpu_uninit(int cpu)
static int svm_cpu_init(int cpu)
{
struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);
+ struct page *save_area_page;
int ret = -ENOMEM;
memset(sd, 0, sizeof(struct svm_cpu_data));
- sd->save_area = snp_safe_alloc_page(NULL);
- if (!sd->save_area)
+ save_area_page = snp_safe_alloc_page_node(cpu_to_node(cpu), GFP_KERNEL);
+ if (!save_area_page)
return ret;
ret = sev_cpu_init(sd);
if (ret)
goto free_save_area;
- sd->save_area_pa = __sme_page_pa(sd->save_area);
+ sd->save_area = page_address(save_area_page);
+ sd->save_area_pa = __sme_page_pa(save_area_page);
return 0;
free_save_area:
- __free_page(sd->save_area);
- sd->save_area = NULL;
+ __free_page(save_area_page);
return ret;
}
@@ -990,7 +995,7 @@ void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb)
vmcb_mark_dirty(to_vmcb, VMCB_LBR);
}
-static void svm_enable_lbrv(struct kvm_vcpu *vcpu)
+void svm_enable_lbrv(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -1000,6 +1005,9 @@ static void svm_enable_lbrv(struct kvm_vcpu *vcpu)
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
+ if (sev_es_guest(vcpu->kvm))
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_DEBUGCTLMSR, 1, 1);
+
/* Move the LBR msrs to the vmcb02 so that the guest can see them. */
if (is_guest_mode(vcpu))
svm_copy_lbrs(svm->vmcb, svm->vmcb01.ptr);
@@ -1009,6 +1017,8 @@ static void svm_disable_lbrv(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ KVM_BUG_ON(sev_es_guest(vcpu->kvm), vcpu->kvm);
+
svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK;
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0);
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
@@ -1115,8 +1125,7 @@ static void svm_hardware_unsetup(void)
for_each_possible_cpu(cpu)
svm_cpu_uninit(cpu);
- __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT),
- get_order(IOPM_SIZE));
+ __free_pages(__sme_pa_to_page(iopm_base), get_order(IOPM_SIZE));
iopm_base = 0;
}
@@ -1196,7 +1205,7 @@ static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- if (guest_cpuid_is_intel(vcpu)) {
+ if (guest_cpuid_is_intel_compatible(vcpu)) {
/*
* We must intercept SYSENTER_EIP and SYSENTER_ESP
* accesses because the processor only stores 32 bits.
@@ -1292,7 +1301,7 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
if (!kvm_hlt_in_guest(vcpu->kvm))
svm_set_intercept(svm, INTERCEPT_HLT);
- control->iopm_base_pa = __sme_set(iopm_base);
+ control->iopm_base_pa = iopm_base;
control->msrpm_base_pa = __sme_set(__pa(svm->msrpm));
control->int_ctl = V_INTR_MASKING_MASK;
@@ -1398,6 +1407,9 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
svm->spec_ctrl = 0;
svm->virt_spec_ctrl = 0;
+ if (init_event)
+ sev_snp_init_protected_guest_state(vcpu);
+
init_vmcb(vcpu);
if (!init_event)
@@ -1421,7 +1433,7 @@ static int svm_vcpu_create(struct kvm_vcpu *vcpu)
svm = to_svm(vcpu);
err = -ENOMEM;
- vmcb01_page = snp_safe_alloc_page(vcpu);
+ vmcb01_page = snp_safe_alloc_page();
if (!vmcb01_page)
goto out;
@@ -1430,17 +1442,9 @@ static int svm_vcpu_create(struct kvm_vcpu *vcpu)
* SEV-ES guests require a separate VMSA page used to contain
* the encrypted register state of the guest.
*/
- vmsa_page = snp_safe_alloc_page(vcpu);
+ vmsa_page = snp_safe_alloc_page();
if (!vmsa_page)
goto error_free_vmcb_page;
-
- /*
- * SEV-ES guests maintain an encrypted version of their FPU
- * state which is restored and saved on VMRUN and VMEXIT.
- * Mark vcpu->arch.guest_fpu->fpstate as scratch so it won't
- * do xsave/xrstor on it.
- */
- fpstate_set_confidential(&vcpu->arch.guest_fpu);
}
err = avic_init_vcpu(svm);
@@ -1499,15 +1503,10 @@ static void svm_vcpu_free(struct kvm_vcpu *vcpu)
sev_free_vcpu(vcpu);
- __free_page(pfn_to_page(__sme_clr(svm->vmcb01.pa) >> PAGE_SHIFT));
+ __free_page(__sme_pa_to_page(svm->vmcb01.pa));
__free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE));
}
-static struct sev_es_save_area *sev_es_host_save_area(struct svm_cpu_data *sd)
-{
- return page_address(sd->save_area) + 0x400;
-}
-
static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -1525,7 +1524,7 @@ static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
*/
vmsave(sd->save_area_pa);
if (sev_es_guest(vcpu->kvm))
- sev_es_prepare_switch_to_guest(sev_es_host_save_area(sd));
+ sev_es_prepare_switch_to_guest(svm, sev_es_host_save_area(sd));
if (tsc_scaling)
__svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio);
@@ -1534,7 +1533,7 @@ static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
* TSC_AUX is always virtualized for SEV-ES guests when the feature is
* available. The user return MSR support is not required in this case
* because TSC_AUX is restored on #VMEXIT from the host save area
- * (which has been initialized in svm_hardware_enable()).
+ * (which has been initialized in svm_enable_virtualization_cpu()).
*/
if (likely(tsc_aux_uret_slot >= 0) &&
(!boot_cpu_has(X86_FEATURE_V_TSC_AUX) || !sev_es_guest(vcpu->kvm)))
@@ -1553,6 +1552,9 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
struct vcpu_svm *svm = to_svm(vcpu);
struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);
+ if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm))
+ shrink_ple_window(vcpu);
+
if (sd->current_vmcb != svm->vmcb) {
sd->current_vmcb = svm->vmcb;
@@ -2052,15 +2054,33 @@ static int pf_interception(struct kvm_vcpu *vcpu)
static int npf_interception(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ int rc;
u64 fault_address = svm->vmcb->control.exit_info_2;
u64 error_code = svm->vmcb->control.exit_info_1;
+ /*
+ * WARN if hardware generates a fault with an error code that collides
+ * with KVM-defined sythentic flags. Clear the flags and continue on,
+ * i.e. don't terminate the VM, as KVM can't possibly be relying on a
+ * flag that KVM doesn't know about.
+ */
+ if (WARN_ON_ONCE(error_code & PFERR_SYNTHETIC_MASK))
+ error_code &= ~PFERR_SYNTHETIC_MASK;
+
+ if (sev_snp_guest(vcpu->kvm) && (error_code & PFERR_GUEST_ENC_MASK))
+ error_code |= PFERR_PRIVATE_ACCESS;
+
trace_kvm_page_fault(vcpu, fault_address, error_code);
- return kvm_mmu_page_fault(vcpu, fault_address, error_code,
- static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
- svm->vmcb->control.insn_bytes : NULL,
- svm->vmcb->control.insn_len);
+ rc = kvm_mmu_page_fault(vcpu, fault_address, error_code,
+ static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
+ svm->vmcb->control.insn_bytes : NULL,
+ svm->vmcb->control.insn_len);
+
+ if (rc > 0 && error_code & PFERR_GUEST_RMP_MASK)
+ sev_handle_rmp_fault(vcpu, fault_address, error_code);
+
+ return rc;
}
static int db_interception(struct kvm_vcpu *vcpu)
@@ -2805,26 +2825,40 @@ static int efer_trap(struct kvm_vcpu *vcpu)
return kvm_complete_insn_gp(vcpu, ret);
}
-static int svm_get_msr_feature(struct kvm_msr_entry *msr)
+static int svm_get_feature_msr(u32 msr, u64 *data)
{
- msr->data = 0;
+ *data = 0;
- switch (msr->index) {
+ switch (msr) {
case MSR_AMD64_DE_CFG:
if (cpu_feature_enabled(X86_FEATURE_LFENCE_RDTSC))
- msr->data |= MSR_AMD64_DE_CFG_LFENCE_SERIALIZE;
+ *data |= MSR_AMD64_DE_CFG_LFENCE_SERIALIZE;
break;
default:
- return KVM_MSR_RET_INVALID;
+ return KVM_MSR_RET_UNSUPPORTED;
}
return 0;
}
+static bool
+sev_es_prevent_msr_access(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ return sev_es_guest(vcpu->kvm) &&
+ vcpu->arch.guest_state_protected &&
+ svm_msrpm_offset(msr_info->index) != MSR_INVALID &&
+ !msr_write_intercepted(vcpu, msr_info->index);
+}
+
static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ if (sev_es_prevent_msr_access(vcpu, msr_info)) {
+ msr_info->data = 0;
+ return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;
+ }
+
switch (msr_info->index) {
case MSR_AMD64_TSC_RATIO:
if (!msr_info->host_initiated &&
@@ -2842,6 +2876,12 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_CSTAR:
msr_info->data = svm->vmcb01.ptr->save.cstar;
break;
+ case MSR_GS_BASE:
+ msr_info->data = svm->vmcb01.ptr->save.gs.base;
+ break;
+ case MSR_FS_BASE:
+ msr_info->data = svm->vmcb01.ptr->save.fs.base;
+ break;
case MSR_KERNEL_GS_BASE:
msr_info->data = svm->vmcb01.ptr->save.kernel_gs_base;
break;
@@ -2854,12 +2894,12 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
break;
case MSR_IA32_SYSENTER_EIP:
msr_info->data = (u32)svm->vmcb01.ptr->save.sysenter_eip;
- if (guest_cpuid_is_intel(vcpu))
+ if (guest_cpuid_is_intel_compatible(vcpu))
msr_info->data |= (u64)svm->sysenter_eip_hi << 32;
break;
case MSR_IA32_SYSENTER_ESP:
msr_info->data = svm->vmcb01.ptr->save.sysenter_esp;
- if (guest_cpuid_is_intel(vcpu))
+ if (guest_cpuid_is_intel_compatible(vcpu))
msr_info->data |= (u64)svm->sysenter_esp_hi << 32;
break;
case MSR_TSC_AUX:
@@ -2975,6 +3015,10 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
u32 ecx = msr->index;
u64 data = msr->data;
+
+ if (sev_es_prevent_msr_access(vcpu, msr))
+ return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;
+
switch (ecx) {
case MSR_AMD64_TSC_RATIO:
@@ -3063,6 +3107,12 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
case MSR_CSTAR:
svm->vmcb01.ptr->save.cstar = data;
break;
+ case MSR_GS_BASE:
+ svm->vmcb01.ptr->save.gs.base = data;
+ break;
+ case MSR_FS_BASE:
+ svm->vmcb01.ptr->save.fs.base = data;
+ break;
case MSR_KERNEL_GS_BASE:
svm->vmcb01.ptr->save.kernel_gs_base = data;
break;
@@ -3082,11 +3132,11 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
* 32 bit part of these msrs to support Intel's
* implementation of SYSENTER/SYSEXIT.
*/
- svm->sysenter_eip_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0;
+ svm->sysenter_eip_hi = guest_cpuid_is_intel_compatible(vcpu) ? (data >> 32) : 0;
break;
case MSR_IA32_SYSENTER_ESP:
svm->vmcb01.ptr->save.sysenter_esp = (u32)data;
- svm->sysenter_esp_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0;
+ svm->sysenter_esp_hi = guest_cpuid_is_intel_compatible(vcpu) ? (data >> 32) : 0;
break;
case MSR_TSC_AUX:
/*
@@ -3094,7 +3144,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
* feature is available. The user return MSR support is not
* required in this case because TSC_AUX is restored on #VMEXIT
* from the host save area (which has been initialized in
- * svm_hardware_enable()).
+ * svm_enable_virtualization_cpu()).
*/
if (boot_cpu_has(X86_FEATURE_V_TSC_AUX) && sev_es_guest(vcpu->kvm))
break;
@@ -3141,18 +3191,21 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
kvm_pr_unimpl_wrmsr(vcpu, ecx, data);
break;
case MSR_AMD64_DE_CFG: {
- struct kvm_msr_entry msr_entry;
+ u64 supported_de_cfg;
- msr_entry.index = msr->index;
- if (svm_get_msr_feature(&msr_entry))
+ if (svm_get_feature_msr(ecx, &supported_de_cfg))
return 1;
- /* Check the supported bits */
- if (data & ~msr_entry.data)
+ if (data & ~supported_de_cfg)
return 1;
- /* Don't allow the guest to change a bit, #GP */
- if (!msr->host_initiated && (data ^ msr_entry.data))
+ /*
+ * Don't let the guest change the host-programmed value. The
+ * MSR is very model specific, i.e. contains multiple bits that
+ * are completely unknown to KVM, and the one bit known to KVM
+ * is simply a reflection of hardware capabilities.
+ */
+ if (!msr->host_initiated && data != svm->msr_decfg)
return 1;
svm->msr_decfg = data;
@@ -3304,7 +3357,9 @@ static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[SVM_EXIT_RSM] = rsm_interception,
[SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception,
[SVM_EXIT_AVIC_UNACCELERATED_ACCESS] = avic_unaccelerated_access_interception,
+#ifdef CONFIG_KVM_AMD_SEV
[SVM_EXIT_VMGEXIT] = sev_handle_vmgexit,
+#endif
};
static void dump_vmcb(struct kvm_vcpu *vcpu)
@@ -3843,16 +3898,27 @@ static void svm_enable_nmi_window(struct kvm_vcpu *vcpu)
struct vcpu_svm *svm = to_svm(vcpu);
/*
- * KVM should never request an NMI window when vNMI is enabled, as KVM
- * allows at most one to-be-injected NMI and one pending NMI, i.e. if
- * two NMIs arrive simultaneously, KVM will inject one and set
- * V_NMI_PENDING for the other. WARN, but continue with the standard
- * single-step approach to try and salvage the pending NMI.
+ * If NMIs are outright masked, i.e. the vCPU is already handling an
+ * NMI, and KVM has not yet intercepted an IRET, then there is nothing
+ * more to do at this time as KVM has already enabled IRET intercepts.
+ * If KVM has already intercepted IRET, then single-step over the IRET,
+ * as NMIs aren't architecturally unmasked until the IRET completes.
+ *
+ * If vNMI is enabled, KVM should never request an NMI window if NMIs
+ * are masked, as KVM allows at most one to-be-injected NMI and one
+ * pending NMI. If two NMIs arrive simultaneously, KVM will inject one
+ * NMI and set V_NMI_PENDING for the other, but if and only if NMIs are
+ * unmasked. KVM _will_ request an NMI window in some situations, e.g.
+ * if the vCPU is in an STI shadow or if GIF=0, KVM can't immediately
+ * inject the NMI. In those situations, KVM needs to single-step over
+ * the STI shadow or intercept STGI.
*/
- WARN_ON_ONCE(is_vnmi_enabled(svm));
+ if (svm_get_nmi_mask(vcpu)) {
+ WARN_ON_ONCE(is_vnmi_enabled(svm));
- if (svm_get_nmi_mask(vcpu) && !svm->awaiting_iret_completion)
- return; /* IRET will cause a vm exit */
+ if (!svm->awaiting_iret_completion)
+ return; /* IRET will cause a vm exit */
+ }
/*
* SEV-ES guests are responsible for signaling when a vCPU is ready to
@@ -4085,17 +4151,29 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu)
static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu)
{
+ if (to_kvm_sev_info(vcpu->kvm)->need_init)
+ return -EINVAL;
+
return 1;
}
static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
if (is_guest_mode(vcpu))
return EXIT_FASTPATH_NONE;
- if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR &&
- to_svm(vcpu)->vmcb->control.exit_info_1)
+ switch (svm->vmcb->control.exit_code) {
+ case SVM_EXIT_MSR:
+ if (!svm->vmcb->control.exit_info_1)
+ break;
return handle_fastpath_set_msr_irqoff(vcpu);
+ case SVM_EXIT_HLT:
+ return handle_fastpath_hlt(vcpu);
+ default:
+ break;
+ }
return EXIT_FASTPATH_NONE;
}
@@ -4331,11 +4409,11 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_LBRV);
/*
- * Intercept VMLOAD if the vCPU mode is Intel in order to emulate that
+ * Intercept VMLOAD if the vCPU model is Intel in order to emulate that
* VMLOAD drops bits 63:32 of SYSENTER (ignoring the fact that exposing
* SVM on Intel is bonkers and extremely unlikely to work).
*/
- if (!guest_cpuid_is_intel(vcpu))
+ if (!guest_cpuid_is_intel_compatible(vcpu))
kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD);
kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_PAUSEFILTER);
@@ -4554,12 +4632,6 @@ static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu)
vcpu->arch.at_instruction_boundary = true;
}
-static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
-{
- if (!kvm_pause_in_guest(vcpu->kvm))
- shrink_ple_window(vcpu);
-}
-
static void svm_setup_mce(struct kvm_vcpu *vcpu)
{
/* [63:9] are reserved. */
@@ -4892,6 +4964,18 @@ static void svm_vm_destroy(struct kvm *kvm)
static int svm_vm_init(struct kvm *kvm)
{
+ int type = kvm->arch.vm_type;
+
+ if (type != KVM_X86_DEFAULT_VM &&
+ type != KVM_X86_SW_PROTECTED_VM) {
+ kvm->arch.has_protected_state =
+ (type == KVM_X86_SEV_ES_VM || type == KVM_X86_SNP_VM);
+ to_kvm_sev_info(kvm)->need_init = true;
+
+ kvm->arch.has_private_mem = (type == KVM_X86_SNP_VM);
+ kvm->arch.pre_fault_allowed = !kvm->arch.has_private_mem;
+ }
+
if (!pause_filter_count || !pause_filter_thresh)
kvm->arch.pause_in_guest = true;
@@ -4906,7 +4990,7 @@ static int svm_vm_init(struct kvm *kvm)
static void *svm_alloc_apic_backing_page(struct kvm_vcpu *vcpu)
{
- struct page *page = snp_safe_alloc_page(vcpu);
+ struct page *page = snp_safe_alloc_page();
if (!page)
return NULL;
@@ -4920,8 +5004,9 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.check_processor_compatibility = svm_check_processor_compat,
.hardware_unsetup = svm_hardware_unsetup,
- .hardware_enable = svm_hardware_enable,
- .hardware_disable = svm_hardware_disable,
+ .enable_virtualization_cpu = svm_enable_virtualization_cpu,
+ .disable_virtualization_cpu = svm_disable_virtualization_cpu,
+ .emergency_disable_virtualization_cpu = svm_emergency_disable_virtualization_cpu,
.has_emulated_msr = svm_has_emulated_msr,
.vcpu_create = svm_vcpu_create,
@@ -4939,7 +5024,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.vcpu_unblocking = avic_vcpu_unblocking,
.update_exception_bitmap = svm_update_exception_bitmap,
- .get_msr_feature = svm_get_msr_feature,
+ .get_feature_msr = svm_get_feature_msr,
.get_msr = svm_get_msr,
.set_msr = svm_set_msr,
.get_segment_base = svm_get_segment_base,
@@ -4990,6 +5075,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.enable_nmi_window = svm_enable_nmi_window,
.enable_irq_window = svm_enable_irq_window,
.update_cr8_intercept = svm_update_cr8_intercept,
+
+ .x2apic_icr_is_split = true,
.set_virtual_apic_mode = avic_refresh_virtual_apic_mode,
.refresh_apicv_exec_ctrl = avic_refresh_apicv_exec_ctrl,
.apicv_post_state_restore = avic_apicv_post_state_restore,
@@ -5011,8 +5098,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.check_intercept = svm_check_intercept,
.handle_exit_irqoff = svm_handle_exit_irqoff,
- .sched_in = svm_sched_in,
-
.nested_ops = &svm_nested_ops,
.deliver_interrupt = svm_deliver_interrupt,
@@ -5026,6 +5111,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.enable_smi_window = svm_enable_smi_window,
#endif
+#ifdef CONFIG_KVM_AMD_SEV
+ .dev_get_attr = sev_dev_get_attr,
.mem_enc_ioctl = sev_mem_enc_ioctl,
.mem_enc_register_region = sev_mem_enc_register_region,
.mem_enc_unregister_region = sev_mem_enc_unregister_region,
@@ -5033,7 +5120,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.vm_copy_enc_context_from = sev_vm_copy_enc_context_from,
.vm_move_enc_context_from = sev_vm_move_enc_context_from,
-
+#endif
.check_emulate_instruction = svm_check_emulate_instruction,
.apic_init_signal_blocked = svm_apic_init_signal_blocked,
@@ -5044,6 +5131,10 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector,
.vcpu_get_apicv_inhibit_reasons = avic_vcpu_get_apicv_inhibit_reasons,
.alloc_apic_backing_page = svm_alloc_apic_backing_page,
+
+ .gmem_prepare = sev_gmem_prepare,
+ .gmem_invalidate = sev_gmem_invalidate,
+ .private_max_mapping_level = sev_private_max_mapping_level,
};
/*
@@ -5160,6 +5251,9 @@ static __init void svm_set_cpu_caps(void)
/* CPUID 0x8000001F (SME/SEV features) */
sev_set_cpu_caps();
+
+ /* Don't advertise Bus Lock Detect to guest if SVM support is absent */
+ kvm_cpu_cap_clear(X86_FEATURE_BUS_LOCK_DETECT);
}
static __init int svm_hardware_setup(void)
@@ -5187,7 +5281,7 @@ static __init int svm_hardware_setup(void)
iopm_va = page_address(iopm_pages);
memset(iopm_va, 0xff, PAGE_SIZE * (1 << order));
- iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
+ iopm_base = __sme_page_pa(iopm_pages);
init_msrpm_offsets();
@@ -5249,6 +5343,12 @@ static __init int svm_hardware_setup(void)
nrips = nrips && boot_cpu_has(X86_FEATURE_NRIPS);
+ if (lbrv) {
+ if (!boot_cpu_has(X86_FEATURE_LBRV))
+ lbrv = false;
+ else
+ pr_info("LBR virtualization supported\n");
+ }
/*
* Note, SEV setup consumes npt_enabled and enable_mmio_caching (which
* may be modified by svm_adjust_mmio_mask()), as well as nrips.
@@ -5302,14 +5402,6 @@ static __init int svm_hardware_setup(void)
svm_x86_ops.set_vnmi_pending = NULL;
}
-
- if (lbrv) {
- if (!boot_cpu_has(X86_FEATURE_LBRV))
- lbrv = false;
- else
- pr_info("LBR virtualization supported\n");
- }
-
if (!enable_pmu)
pr_info("PMU virtualization is disabled\n");
@@ -5348,8 +5440,6 @@ static struct kvm_x86_init_ops svm_init_ops __initdata = {
static void __svm_exit(void)
{
kvm_x86_vendor_exit();
-
- cpu_emergency_unregister_virt_callback(svm_emergency_disable);
}
static int __init svm_init(void)
@@ -5365,8 +5455,6 @@ static int __init svm_init(void)
if (r)
return r;
- cpu_emergency_register_virt_callback(svm_emergency_disable);
-
/*
* Common KVM initialization _must_ come last, after this, /dev/kvm is
* exposed to userspace!
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 33878efdebc8..43fa6a16eb19 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -25,12 +25,26 @@
#include "cpuid.h"
#include "kvm_cache_regs.h"
-#define __sme_page_pa(x) __sme_set(page_to_pfn(x) << PAGE_SHIFT)
+/*
+ * Helpers to convert to/from physical addresses for pages whose address is
+ * consumed directly by hardware. Even though it's a physical address, SVM
+ * often restricts the address to the natural width, hence 'unsigned long'
+ * instead of 'hpa_t'.
+ */
+static inline unsigned long __sme_page_pa(struct page *page)
+{
+ return __sme_set(page_to_pfn(page) << PAGE_SHIFT);
+}
+
+static inline struct page *__sme_pa_to_page(unsigned long pa)
+{
+ return pfn_to_page(__sme_clr(pa) >> PAGE_SHIFT);
+}
#define IOPM_SIZE PAGE_SIZE * 3
#define MSRPM_SIZE PAGE_SIZE * 2
-#define MAX_DIRECT_ACCESS_MSRS 47
+#define MAX_DIRECT_ACCESS_MSRS 48
#define MSRPM_OFFSETS 32
extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
extern bool npt_enabled;
@@ -39,6 +53,7 @@ extern int vgif;
extern bool intercept_smi;
extern bool x2avic_enabled;
extern bool vnmi;
+extern int lbrv;
/*
* Clean bits in VMCB.
@@ -79,17 +94,24 @@ enum {
struct kvm_sev_info {
bool active; /* SEV enabled guest */
bool es_active; /* SEV-ES enabled guest */
+ bool need_init; /* waiting for SEV_INIT2 */
unsigned int asid; /* ASID used for this guest */
unsigned int handle; /* SEV firmware handle */
int fd; /* SEV device fd */
unsigned long pages_locked; /* Number of pages locked */
struct list_head regions_list; /* List of registered regions */
u64 ap_jump_table; /* SEV-ES AP Jump Table address */
+ u64 vmsa_features;
+ u16 ghcb_version; /* Highest guest GHCB protocol version allowed */
struct kvm *enc_context_owner; /* Owner of copied encryption context */
struct list_head mirror_vms; /* List of VMs mirroring */
struct list_head mirror_entry; /* Use as a list entry of mirrors */
struct misc_cg *misc_cg; /* For misc cgroup accounting */
atomic_t migration_in_progress;
+ void *snp_context; /* SNP guest context page */
+ void *guest_req_buf; /* Bounce buffer for SNP Guest Request input */
+ void *guest_resp_buf; /* Bounce buffer for SNP Guest Request output */
+ struct mutex guest_req_mutex; /* Must acquire before using bounce buffers */
};
struct kvm_svm {
@@ -197,6 +219,7 @@ struct vcpu_sev_es_state {
u8 valid_bitmap[16];
struct kvm_host_map ghcb_map;
bool received_first_sipi;
+ unsigned int ap_reset_hold_type;
/* SEV-ES scratch area support */
u64 sw_scratch;
@@ -204,6 +227,18 @@ struct vcpu_sev_es_state {
u32 ghcb_sa_len;
bool ghcb_sa_sync;
bool ghcb_sa_free;
+
+ /* SNP Page-State-Change buffer entries currently being processed */
+ u16 psc_idx;
+ u16 psc_inflight;
+ bool psc_2m;
+
+ u64 ghcb_registered_gpa;
+
+ struct mutex snp_vmsa_mutex; /* Used to handle concurrent updates of VMSA. */
+ gpa_t snp_vmsa_gpa;
+ bool snp_ap_waiting_for_reset;
+ bool snp_has_guest_vmsa;
};
struct vcpu_svm {
@@ -300,7 +335,7 @@ struct svm_cpu_data {
u32 next_asid;
u32 min_asid;
- struct page *save_area;
+ struct vmcb *save_area;
unsigned long save_area_pa;
struct vmcb *current_vmcb;
@@ -318,6 +353,11 @@ static __always_inline struct kvm_svm *to_kvm_svm(struct kvm *kvm)
return container_of(kvm, struct kvm_svm, kvm);
}
+static __always_inline struct kvm_sev_info *to_kvm_sev_info(struct kvm *kvm)
+{
+ return &to_kvm_svm(kvm)->sev_info;
+}
+
static __always_inline bool sev_guest(struct kvm *kvm)
{
#ifdef CONFIG_KVM_AMD_SEV
@@ -340,6 +380,23 @@ static __always_inline bool sev_es_guest(struct kvm *kvm)
#endif
}
+static __always_inline bool sev_snp_guest(struct kvm *kvm)
+{
+#ifdef CONFIG_KVM_AMD_SEV
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+
+ return (sev->vmsa_features & SVM_SEV_FEAT_SNP_ACTIVE) &&
+ !WARN_ON_ONCE(!sev_es_guest(kvm));
+#else
+ return false;
+#endif
+}
+
+static inline bool ghcb_gpa_is_registered(struct vcpu_svm *svm, u64 val)
+{
+ return svm->sev_es.ghcb_registered_gpa == val;
+}
+
static inline void vmcb_mark_all_dirty(struct vmcb *vmcb)
{
vmcb->control.clean = 0;
@@ -543,6 +600,7 @@ u32 *svm_vcpu_alloc_msrpm(void);
void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm);
void svm_vcpu_free_msrpm(u32 *msrpm);
void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb);
+void svm_enable_lbrv(struct kvm_vcpu *vcpu);
void svm_update_lbrv(struct kvm_vcpu *vcpu);
int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer);
@@ -627,7 +685,7 @@ extern struct kvm_x86_nested_ops svm_nested_ops;
/* avic.c */
#define AVIC_REQUIRED_APICV_INHIBITS \
( \
- BIT(APICV_INHIBIT_REASON_DISABLE) | \
+ BIT(APICV_INHIBIT_REASON_DISABLED) | \
BIT(APICV_INHIBIT_REASON_ABSENT) | \
BIT(APICV_INHIBIT_REASON_HYPERV) | \
BIT(APICV_INHIBIT_REASON_NESTED) | \
@@ -664,13 +722,16 @@ void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu);
/* sev.c */
-#define GHCB_VERSION_MAX 1ULL
-#define GHCB_VERSION_MIN 1ULL
-
-
-extern unsigned int max_sev_asid;
+void pre_sev_run(struct vcpu_svm *svm, int cpu);
+void sev_init_vmcb(struct vcpu_svm *svm);
+void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm);
+int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in);
+void sev_es_vcpu_reset(struct vcpu_svm *svm);
+void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector);
+void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_area *hostsa);
+void sev_es_unmap_ghcb(struct vcpu_svm *svm);
-void sev_vm_destroy(struct kvm *kvm);
+#ifdef CONFIG_KVM_AMD_SEV
int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp);
int sev_mem_enc_register_region(struct kvm *kvm,
struct kvm_enc_region *range);
@@ -679,22 +740,61 @@ int sev_mem_enc_unregister_region(struct kvm *kvm,
int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd);
int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd);
void sev_guest_memory_reclaimed(struct kvm *kvm);
+int sev_handle_vmgexit(struct kvm_vcpu *vcpu);
-void pre_sev_run(struct vcpu_svm *svm, int cpu);
+/* These symbols are used in common code and are stubbed below. */
+
+struct page *snp_safe_alloc_page_node(int node, gfp_t gfp);
+static inline struct page *snp_safe_alloc_page(void)
+{
+ return snp_safe_alloc_page_node(numa_node_id(), GFP_KERNEL_ACCOUNT);
+}
+
+void sev_free_vcpu(struct kvm_vcpu *vcpu);
+void sev_vm_destroy(struct kvm *kvm);
void __init sev_set_cpu_caps(void);
void __init sev_hardware_setup(void);
void sev_hardware_unsetup(void);
int sev_cpu_init(struct svm_cpu_data *sd);
-void sev_init_vmcb(struct vcpu_svm *svm);
-void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm);
-void sev_free_vcpu(struct kvm_vcpu *vcpu);
-int sev_handle_vmgexit(struct kvm_vcpu *vcpu);
-int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in);
-void sev_es_vcpu_reset(struct vcpu_svm *svm);
-void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector);
-void sev_es_prepare_switch_to_guest(struct sev_es_save_area *hostsa);
-void sev_es_unmap_ghcb(struct vcpu_svm *svm);
-struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu);
+int sev_dev_get_attr(u32 group, u64 attr, u64 *val);
+extern unsigned int max_sev_asid;
+void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code);
+void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu);
+int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order);
+void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end);
+int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn);
+#else
+static inline struct page *snp_safe_alloc_page_node(int node, gfp_t gfp)
+{
+ return alloc_pages_node(node, gfp | __GFP_ZERO, 0);
+}
+
+static inline struct page *snp_safe_alloc_page(void)
+{
+ return snp_safe_alloc_page_node(numa_node_id(), GFP_KERNEL_ACCOUNT);
+}
+
+static inline void sev_free_vcpu(struct kvm_vcpu *vcpu) {}
+static inline void sev_vm_destroy(struct kvm *kvm) {}
+static inline void __init sev_set_cpu_caps(void) {}
+static inline void __init sev_hardware_setup(void) {}
+static inline void sev_hardware_unsetup(void) {}
+static inline int sev_cpu_init(struct svm_cpu_data *sd) { return 0; }
+static inline int sev_dev_get_attr(u32 group, u64 attr, u64 *val) { return -ENXIO; }
+#define max_sev_asid 0
+static inline void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code) {}
+static inline void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu) {}
+static inline int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order)
+{
+ return 0;
+}
+static inline void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end) {}
+static inline int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn)
+{
+ return 0;
+}
+
+#endif
/* vmenter.S */
diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S
index a0c8eb37d3e1..2ed80aea3bb1 100644
--- a/arch/x86/kvm/svm/vmenter.S
+++ b/arch/x86/kvm/svm/vmenter.S
@@ -209,10 +209,8 @@ SYM_FUNC_START(__svm_vcpu_run)
7: vmload %_ASM_AX
8:
-#ifdef CONFIG_MITIGATION_RETPOLINE
/* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */
- FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE
-#endif
+ FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_VMEXIT
/* Clobbers RAX, RCX, RDX. */
RESTORE_HOST_SPEC_CTRL
@@ -348,10 +346,8 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
2: cli
-#ifdef CONFIG_MITIGATION_RETPOLINE
/* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */
- FILL_RETURN_BUFFER %rax, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE
-#endif
+ FILL_RETURN_BUFFER %rax, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_VMEXIT
/* Clobbers RAX, RCX, RDX, consumes RDI (@svm) and RSI (@spec_ctrl_intercepted). */
RESTORE_HOST_SPEC_CTRL
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index c6b4b1728006..d3aeffd6ae75 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -314,12 +314,12 @@ TRACE_EVENT(name, \
__entry->guest_rip = kvm_rip_read(vcpu); \
__entry->isa = isa; \
__entry->vcpu_id = vcpu->vcpu_id; \
- static_call(kvm_x86_get_exit_info)(vcpu, \
- &__entry->exit_reason, \
- &__entry->info1, \
- &__entry->info2, \
- &__entry->intr_info, \
- &__entry->error_code); \
+ kvm_x86_call(get_exit_info)(vcpu, \
+ &__entry->exit_reason, \
+ &__entry->info1, \
+ &__entry->info2, \
+ &__entry->intr_info, \
+ &__entry->error_code); \
), \
\
TP_printk("vcpu %u reason %s%s%s rip 0x%lx info1 0x%016llx " \
@@ -828,7 +828,8 @@ TRACE_EVENT(kvm_emulate_insn,
),
TP_fast_assign(
- __entry->csbase = static_call(kvm_x86_get_segment_base)(vcpu, VCPU_SREG_CS);
+ __entry->csbase = kvm_x86_call(get_segment_base)(vcpu,
+ VCPU_SREG_CS);
__entry->len = vcpu->arch.emulate_ctxt->fetch.ptr
- vcpu->arch.emulate_ctxt->fetch.data;
__entry->rip = vcpu->arch.emulate_ctxt->_eip - __entry->len;
@@ -1074,7 +1075,7 @@ TRACE_EVENT(kvm_smm_transition,
);
/*
- * Tracepoint for VT-d posted-interrupts.
+ * Tracepoint for VT-d posted-interrupts and AMD-Vi Guest Virtual APIC.
*/
TRACE_EVENT(kvm_pi_irte_update,
TP_PROTO(unsigned int host_irq, unsigned int vcpu_id,
@@ -1100,7 +1101,7 @@ TRACE_EVENT(kvm_pi_irte_update,
__entry->set = set;
),
- TP_printk("VT-d PI is %s for irq %u, vcpu %u, gsi: 0x%x, "
+ TP_printk("PI is %s for irq %u, vcpu %u, gsi: 0x%x, "
"gvec: 0x%x, pi_desc_addr: 0x%llx",
__entry->set ? "enabled and being updated" : "disabled",
__entry->host_irq,
@@ -1375,6 +1376,10 @@ TRACE_EVENT(kvm_hv_stimer_cleanup,
__entry->vcpu_id, __entry->timer_index)
);
+#define kvm_print_apicv_inhibit_reasons(inhibits) \
+ (inhibits), (inhibits) ? " " : "", \
+ (inhibits) ? __print_flags(inhibits, "|", APICV_INHIBIT_REASONS) : ""
+
TRACE_EVENT(kvm_apicv_inhibit_changed,
TP_PROTO(int reason, bool set, unsigned long inhibits),
TP_ARGS(reason, set, inhibits),
@@ -1391,9 +1396,10 @@ TRACE_EVENT(kvm_apicv_inhibit_changed,
__entry->inhibits = inhibits;
),
- TP_printk("%s reason=%u, inhibits=0x%lx",
+ TP_printk("%s reason=%u, inhibits=0x%lx%s%s",
__entry->set ? "set" : "cleared",
- __entry->reason, __entry->inhibits)
+ __entry->reason,
+ kvm_print_apicv_inhibit_reasons(__entry->inhibits))
);
TRACE_EVENT(kvm_apicv_accept_irq,
@@ -1678,7 +1684,7 @@ TRACE_EVENT(kvm_nested_vmenter_failed,
),
TP_fast_assign(
- __assign_str(msg, msg);
+ __assign_str(msg);
__entry->err = err;
),
@@ -1834,6 +1840,37 @@ TRACE_EVENT(kvm_vmgexit_msr_protocol_exit,
__entry->vcpu_id, __entry->ghcb_gpa, __entry->result)
);
+/*
+ * Tracepoint for #NPFs due to RMP faults.
+ */
+TRACE_EVENT(kvm_rmp_fault,
+ TP_PROTO(struct kvm_vcpu *vcpu, u64 gpa, u64 pfn, u64 error_code,
+ int rmp_level, int psmash_ret),
+ TP_ARGS(vcpu, gpa, pfn, error_code, rmp_level, psmash_ret),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, vcpu_id)
+ __field(u64, gpa)
+ __field(u64, pfn)
+ __field(u64, error_code)
+ __field(int, rmp_level)
+ __field(int, psmash_ret)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu->vcpu_id;
+ __entry->gpa = gpa;
+ __entry->pfn = pfn;
+ __entry->error_code = error_code;
+ __entry->rmp_level = rmp_level;
+ __entry->psmash_ret = psmash_ret;
+ ),
+
+ TP_printk("vcpu %u gpa %016llx pfn 0x%llx error_code 0x%llx rmp_level %d psmash_ret %d",
+ __entry->vcpu_id, __entry->gpa, __entry->pfn,
+ __entry->error_code, __entry->rmp_level, __entry->psmash_ret)
+);
+
#endif /* _TRACE_KVM_H */
#undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index 41a4533f9989..cb6588238f46 100644
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -54,9 +54,7 @@ struct nested_vmx_msrs {
};
struct vmcs_config {
- int size;
- u32 basic_cap;
- u32 revision_id;
+ u64 basic;
u32 pin_based_exec_ctrl;
u32 cpu_based_exec_ctrl;
u32 cpu_based_2nd_exec_ctrl;
@@ -76,7 +74,7 @@ extern struct vmx_capability vmx_capability __ro_after_init;
static inline bool cpu_has_vmx_basic_inout(void)
{
- return (((u64)vmcs_config.basic_cap << 32) & VMX_BASIC_INOUT);
+ return vmcs_config.basic & VMX_BASIC_INOUT;
}
static inline bool cpu_has_virtual_nmis(void)
@@ -225,7 +223,7 @@ static inline bool cpu_has_vmx_vmfunc(void)
static inline bool cpu_has_vmx_shadow_vmcs(void)
{
/* check if the cpu supports writing r/o exit information fields */
- if (!(vmcs_config.misc & MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS))
+ if (!(vmcs_config.misc & VMX_MISC_VMWRITE_SHADOW_RO_FIELDS))
return false;
return vmcs_config.cpu_based_2nd_exec_ctrl &
@@ -367,7 +365,7 @@ static inline bool cpu_has_vmx_invvpid_global(void)
static inline bool cpu_has_vmx_intel_pt(void)
{
- return (vmcs_config.misc & MSR_IA32_VMX_MISC_INTEL_PT) &&
+ return (vmcs_config.misc & VMX_MISC_INTEL_PT) &&
(vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_PT_USE_GPA) &&
(vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_RTIT_CTL);
}
diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
new file mode 100644
index 000000000000..7668e2fb8043
--- /dev/null
+++ b/arch/x86/kvm/vmx/main.c
@@ -0,0 +1,168 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/moduleparam.h>
+
+#include "x86_ops.h"
+#include "vmx.h"
+#include "nested.h"
+#include "pmu.h"
+#include "posted_intr.h"
+
+#define VMX_REQUIRED_APICV_INHIBITS \
+ (BIT(APICV_INHIBIT_REASON_DISABLED) | \
+ BIT(APICV_INHIBIT_REASON_ABSENT) | \
+ BIT(APICV_INHIBIT_REASON_HYPERV) | \
+ BIT(APICV_INHIBIT_REASON_BLOCKIRQ) | \
+ BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED) | \
+ BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) | \
+ BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED))
+
+struct kvm_x86_ops vt_x86_ops __initdata = {
+ .name = KBUILD_MODNAME,
+
+ .check_processor_compatibility = vmx_check_processor_compat,
+
+ .hardware_unsetup = vmx_hardware_unsetup,
+
+ .enable_virtualization_cpu = vmx_enable_virtualization_cpu,
+ .disable_virtualization_cpu = vmx_disable_virtualization_cpu,
+ .emergency_disable_virtualization_cpu = vmx_emergency_disable_virtualization_cpu,
+
+ .has_emulated_msr = vmx_has_emulated_msr,
+
+ .vm_size = sizeof(struct kvm_vmx),
+ .vm_init = vmx_vm_init,
+ .vm_destroy = vmx_vm_destroy,
+
+ .vcpu_precreate = vmx_vcpu_precreate,
+ .vcpu_create = vmx_vcpu_create,
+ .vcpu_free = vmx_vcpu_free,
+ .vcpu_reset = vmx_vcpu_reset,
+
+ .prepare_switch_to_guest = vmx_prepare_switch_to_guest,
+ .vcpu_load = vmx_vcpu_load,
+ .vcpu_put = vmx_vcpu_put,
+
+ .update_exception_bitmap = vmx_update_exception_bitmap,
+ .get_feature_msr = vmx_get_feature_msr,
+ .get_msr = vmx_get_msr,
+ .set_msr = vmx_set_msr,
+ .get_segment_base = vmx_get_segment_base,
+ .get_segment = vmx_get_segment,
+ .set_segment = vmx_set_segment,
+ .get_cpl = vmx_get_cpl,
+ .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
+ .is_valid_cr0 = vmx_is_valid_cr0,
+ .set_cr0 = vmx_set_cr0,
+ .is_valid_cr4 = vmx_is_valid_cr4,
+ .set_cr4 = vmx_set_cr4,
+ .set_efer = vmx_set_efer,
+ .get_idt = vmx_get_idt,
+ .set_idt = vmx_set_idt,
+ .get_gdt = vmx_get_gdt,
+ .set_gdt = vmx_set_gdt,
+ .set_dr7 = vmx_set_dr7,
+ .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
+ .cache_reg = vmx_cache_reg,
+ .get_rflags = vmx_get_rflags,
+ .set_rflags = vmx_set_rflags,
+ .get_if_flag = vmx_get_if_flag,
+
+ .flush_tlb_all = vmx_flush_tlb_all,
+ .flush_tlb_current = vmx_flush_tlb_current,
+ .flush_tlb_gva = vmx_flush_tlb_gva,
+ .flush_tlb_guest = vmx_flush_tlb_guest,
+
+ .vcpu_pre_run = vmx_vcpu_pre_run,
+ .vcpu_run = vmx_vcpu_run,
+ .handle_exit = vmx_handle_exit,
+ .skip_emulated_instruction = vmx_skip_emulated_instruction,
+ .update_emulated_instruction = vmx_update_emulated_instruction,
+ .set_interrupt_shadow = vmx_set_interrupt_shadow,
+ .get_interrupt_shadow = vmx_get_interrupt_shadow,
+ .patch_hypercall = vmx_patch_hypercall,
+ .inject_irq = vmx_inject_irq,
+ .inject_nmi = vmx_inject_nmi,
+ .inject_exception = vmx_inject_exception,
+ .cancel_injection = vmx_cancel_injection,
+ .interrupt_allowed = vmx_interrupt_allowed,
+ .nmi_allowed = vmx_nmi_allowed,
+ .get_nmi_mask = vmx_get_nmi_mask,
+ .set_nmi_mask = vmx_set_nmi_mask,
+ .enable_nmi_window = vmx_enable_nmi_window,
+ .enable_irq_window = vmx_enable_irq_window,
+ .update_cr8_intercept = vmx_update_cr8_intercept,
+
+ .x2apic_icr_is_split = false,
+ .set_virtual_apic_mode = vmx_set_virtual_apic_mode,
+ .set_apic_access_page_addr = vmx_set_apic_access_page_addr,
+ .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
+ .load_eoi_exitmap = vmx_load_eoi_exitmap,
+ .apicv_pre_state_restore = vmx_apicv_pre_state_restore,
+ .required_apicv_inhibits = VMX_REQUIRED_APICV_INHIBITS,
+ .hwapic_irr_update = vmx_hwapic_irr_update,
+ .hwapic_isr_update = vmx_hwapic_isr_update,
+ .sync_pir_to_irr = vmx_sync_pir_to_irr,
+ .deliver_interrupt = vmx_deliver_interrupt,
+ .dy_apicv_has_pending_interrupt = pi_has_pending_interrupt,
+
+ .set_tss_addr = vmx_set_tss_addr,
+ .set_identity_map_addr = vmx_set_identity_map_addr,
+ .get_mt_mask = vmx_get_mt_mask,
+
+ .get_exit_info = vmx_get_exit_info,
+
+ .vcpu_after_set_cpuid = vmx_vcpu_after_set_cpuid,
+
+ .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
+
+ .get_l2_tsc_offset = vmx_get_l2_tsc_offset,
+ .get_l2_tsc_multiplier = vmx_get_l2_tsc_multiplier,
+ .write_tsc_offset = vmx_write_tsc_offset,
+ .write_tsc_multiplier = vmx_write_tsc_multiplier,
+
+ .load_mmu_pgd = vmx_load_mmu_pgd,
+
+ .check_intercept = vmx_check_intercept,
+ .handle_exit_irqoff = vmx_handle_exit_irqoff,
+
+ .cpu_dirty_log_size = PML_ENTITY_NUM,
+ .update_cpu_dirty_logging = vmx_update_cpu_dirty_logging,
+
+ .nested_ops = &vmx_nested_ops,
+
+ .pi_update_irte = vmx_pi_update_irte,
+ .pi_start_assignment = vmx_pi_start_assignment,
+
+#ifdef CONFIG_X86_64
+ .set_hv_timer = vmx_set_hv_timer,
+ .cancel_hv_timer = vmx_cancel_hv_timer,
+#endif
+
+ .setup_mce = vmx_setup_mce,
+
+#ifdef CONFIG_KVM_SMM
+ .smi_allowed = vmx_smi_allowed,
+ .enter_smm = vmx_enter_smm,
+ .leave_smm = vmx_leave_smm,
+ .enable_smi_window = vmx_enable_smi_window,
+#endif
+
+ .check_emulate_instruction = vmx_check_emulate_instruction,
+ .apic_init_signal_blocked = vmx_apic_init_signal_blocked,
+ .migrate_timers = vmx_migrate_timers,
+
+ .msr_filter_changed = vmx_msr_filter_changed,
+ .complete_emulated_msr = kvm_complete_insn_gp,
+
+ .vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector,
+
+ .get_untagged_addr = vmx_get_untagged_addr,
+};
+
+struct kvm_x86_init_ops vt_init_ops __initdata = {
+ .hardware_setup = vmx_hardware_setup,
+ .handle_intel_pt_intr = NULL,
+
+ .runtime_ops = &vt_x86_ops,
+ .pmu_ops = &intel_pmu_ops,
+};
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index d05ddf751491..a8e7bc04d9bf 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -12,6 +12,7 @@
#include "mmu.h"
#include "nested.h"
#include "pmu.h"
+#include "posted_intr.h"
#include "sgx.h"
#include "trace.h"
#include "vmx.h"
@@ -409,18 +410,40 @@ static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned long exit_qualification;
u32 vm_exit_reason;
- unsigned long exit_qualification = vcpu->arch.exit_qualification;
if (vmx->nested.pml_full) {
vm_exit_reason = EXIT_REASON_PML_FULL;
vmx->nested.pml_full = false;
- exit_qualification &= INTR_INFO_UNBLOCK_NMI;
+
+ /*
+ * It should be impossible to trigger a nested PML Full VM-Exit
+ * for anything other than an EPT Violation from L2. KVM *can*
+ * trigger nEPT page fault injection in response to an EPT
+ * Misconfig, e.g. if the MMIO SPTE was stale and L1's EPT
+ * tables also changed, but KVM should not treat EPT Misconfig
+ * VM-Exits as writes.
+ */
+ WARN_ON_ONCE(vmx->exit_reason.basic != EXIT_REASON_EPT_VIOLATION);
+
+ /*
+ * PML Full and EPT Violation VM-Exits both use bit 12 to report
+ * "NMI unblocking due to IRET", i.e. the bit can be propagated
+ * as-is from the original EXIT_QUALIFICATION.
+ */
+ exit_qualification = vmx_get_exit_qual(vcpu) & INTR_INFO_UNBLOCK_NMI;
} else {
- if (fault->error_code & PFERR_RSVD_MASK)
+ if (fault->error_code & PFERR_RSVD_MASK) {
vm_exit_reason = EXIT_REASON_EPT_MISCONFIG;
- else
+ exit_qualification = 0;
+ } else {
+ exit_qualification = fault->exit_qualification;
+ exit_qualification |= vmx_get_exit_qual(vcpu) &
+ (EPT_VIOLATION_GVA_IS_VALID |
+ EPT_VIOLATION_GVA_TRANSLATED);
vm_exit_reason = EXIT_REASON_EPT_VIOLATION;
+ }
/*
* Although the caller (kvm_inject_emulated_page_fault) would
@@ -958,7 +981,7 @@ static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
__func__, i, e.index, e.reserved);
goto fail;
}
- if (kvm_set_msr(vcpu, e.index, e.value)) {
+ if (kvm_set_msr_with_filter(vcpu, e.index, e.value)) {
pr_debug_ratelimited(
"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
__func__, i, e.index, e.value);
@@ -994,7 +1017,7 @@ static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu,
}
}
- if (kvm_get_msr(vcpu, msr_index, data)) {
+ if (kvm_get_msr_with_filter(vcpu, msr_index, data)) {
pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__,
msr_index);
return false;
@@ -1089,9 +1112,9 @@ static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu,
/*
* Emulated VMEntry does not fail here. Instead a less
* accurate value will be returned by
- * nested_vmx_get_vmexit_msr_value() using kvm_get_msr()
- * instead of reading the value from the vmcs02 VMExit
- * MSR-store area.
+ * nested_vmx_get_vmexit_msr_value() by reading KVM's
+ * internal MSR state instead of reading the value from
+ * the vmcs02 VMExit MSR-store area.
*/
pr_warn_ratelimited(
"Not enough msr entries in msr_autostore. Can't add msr %x\n",
@@ -1228,21 +1251,32 @@ static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
{
- const u64 feature_and_reserved =
- /* feature (except bit 48; see below) */
- BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) |
- /* reserved */
- BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56);
+ const u64 feature_bits = VMX_BASIC_DUAL_MONITOR_TREATMENT |
+ VMX_BASIC_INOUT |
+ VMX_BASIC_TRUE_CTLS;
+
+ const u64 reserved_bits = GENMASK_ULL(63, 56) |
+ GENMASK_ULL(47, 45) |
+ BIT_ULL(31);
+
u64 vmx_basic = vmcs_config.nested.basic;
- if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved))
+ BUILD_BUG_ON(feature_bits & reserved_bits);
+
+ /*
+ * Except for 32BIT_PHYS_ADDR_ONLY, which is an anti-feature bit (has
+ * inverted polarity), the incoming value must not set feature bits or
+ * reserved bits that aren't allowed/supported by KVM. Fields, i.e.
+ * multi-bit values, are explicitly checked below.
+ */
+ if (!is_bitwise_subset(vmx_basic, data, feature_bits | reserved_bits))
return -EINVAL;
/*
* KVM does not emulate a version of VMX that constrains physical
* addresses of VMX structures (e.g. VMCS) to 32-bits.
*/
- if (data & BIT_ULL(48))
+ if (data & VMX_BASIC_32BIT_PHYS_ADDR_ONLY)
return -EINVAL;
if (vmx_basic_vmcs_revision_id(vmx_basic) !=
@@ -1311,16 +1345,29 @@ vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
{
- const u64 feature_and_reserved_bits =
- /* feature */
- BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) |
- BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) |
- /* reserved */
- GENMASK_ULL(13, 9) | BIT_ULL(31);
+ const u64 feature_bits = VMX_MISC_SAVE_EFER_LMA |
+ VMX_MISC_ACTIVITY_HLT |
+ VMX_MISC_ACTIVITY_SHUTDOWN |
+ VMX_MISC_ACTIVITY_WAIT_SIPI |
+ VMX_MISC_INTEL_PT |
+ VMX_MISC_RDMSR_IN_SMM |
+ VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
+ VMX_MISC_VMXOFF_BLOCK_SMI |
+ VMX_MISC_ZERO_LEN_INS;
+
+ const u64 reserved_bits = BIT_ULL(31) | GENMASK_ULL(13, 9);
+
u64 vmx_misc = vmx_control_msr(vmcs_config.nested.misc_low,
vmcs_config.nested.misc_high);
- if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits))
+ BUILD_BUG_ON(feature_bits & reserved_bits);
+
+ /*
+ * The incoming value must not set feature bits or reserved bits that
+ * aren't allowed/supported by KVM. Fields, i.e. multi-bit values, are
+ * explicitly checked below.
+ */
+ if (!is_bitwise_subset(vmx_misc, data, feature_bits | reserved_bits))
return -EINVAL;
if ((vmx->nested.msrs.pinbased_ctls_high &
@@ -2220,6 +2267,9 @@ static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
vmcs_write64(EPT_POINTER,
construct_eptp(&vmx->vcpu, 0, PT64_ROOT_4LEVEL));
+ if (vmx->ve_info)
+ vmcs_write64(VE_INFORMATION_ADDRESS, __pa(vmx->ve_info));
+
/* All VMFUNCs are currently emulated through L0 vmexits. */
if (cpu_has_vmx_vmfunc())
vmcs_write64(VM_FUNCTION_CONTROL, 0);
@@ -2291,10 +2341,12 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0
/* Posted interrupts setting is only taken from vmcs12. */
vmx->nested.pi_pending = false;
- if (nested_cpu_has_posted_intr(vmcs12))
+ if (nested_cpu_has_posted_intr(vmcs12)) {
vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
- else
+ } else {
+ vmx->nested.posted_intr_nv = -1;
exec_control &= ~PIN_BASED_POSTED_INTR;
+ }
pin_controls_set(vmx, exec_control);
/*
@@ -2400,7 +2452,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0
if (cpu_has_load_ia32_efer()) {
if (guest_efer & EFER_LMA)
exec_control |= VM_ENTRY_IA32E_MODE;
- if (guest_efer != host_efer)
+ if (guest_efer != kvm_host.efer)
exec_control |= VM_ENTRY_LOAD_IA32_EFER;
}
vm_entry_controls_set(vmx, exec_control);
@@ -2413,7 +2465,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0
* bits may be modified by vmx_set_efer() in prepare_vmcs02().
*/
exec_control = __vm_exit_controls_get(vmcs01);
- if (cpu_has_load_ia32_efer() && guest_efer != host_efer)
+ if (cpu_has_load_ia32_efer() && guest_efer != kvm_host.efer)
exec_control |= VM_EXIT_LOAD_IA32_EFER;
else
exec_control &= ~VM_EXIT_LOAD_IA32_EFER;
@@ -2444,6 +2496,7 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) {
+
vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
@@ -2481,7 +2534,7 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
- vmx->segment_cache.bitmask = 0;
+ vmx_segment_cache_clear(vmx);
}
if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
@@ -3874,8 +3927,8 @@ static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
if (!pi_test_and_clear_on(vmx->nested.pi_desc))
return 0;
- max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256);
- if (max_irr != 256) {
+ max_irr = pi_find_highest_vector(vmx->nested.pi_desc);
+ if (max_irr > 0) {
vapic_page = vmx->nested.virtual_apic_map.hva;
if (!vapic_page)
goto mmio_needed;
@@ -4006,10 +4059,46 @@ static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu)
to_vmx(vcpu)->nested.preemption_timer_expired;
}
-static bool vmx_has_nested_events(struct kvm_vcpu *vcpu)
+static bool vmx_has_nested_events(struct kvm_vcpu *vcpu, bool for_injection)
{
- return nested_vmx_preemption_timer_pending(vcpu) ||
- to_vmx(vcpu)->nested.mtf_pending;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ void *vapic = vmx->nested.virtual_apic_map.hva;
+ int max_irr, vppr;
+
+ if (nested_vmx_preemption_timer_pending(vcpu) ||
+ vmx->nested.mtf_pending)
+ return true;
+
+ /*
+ * Virtual Interrupt Delivery doesn't require manual injection. Either
+ * the interrupt is already in GUEST_RVI and will be recognized by CPU
+ * at VM-Entry, or there is a KVM_REQ_EVENT pending and KVM will move
+ * the interrupt from the PIR to RVI prior to entering the guest.
+ */
+ if (for_injection)
+ return false;
+
+ if (!nested_cpu_has_vid(get_vmcs12(vcpu)) ||
+ __vmx_interrupt_blocked(vcpu))
+ return false;
+
+ if (!vapic)
+ return false;
+
+ vppr = *((u32 *)(vapic + APIC_PROCPRI));
+
+ max_irr = vmx_get_rvi();
+ if ((max_irr & 0xf0) > (vppr & 0xf0))
+ return true;
+
+ if (vmx->nested.pi_pending && vmx->nested.pi_desc &&
+ pi_test_on(vmx->nested.pi_desc)) {
+ max_irr = pi_find_highest_vector(vmx->nested.pi_desc);
+ if (max_irr > 0 && (max_irr & 0xf0) > (vppr & 0xf0))
+ return true;
+ }
+
+ return false;
}
/*
@@ -4222,11 +4311,52 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
}
if (kvm_cpu_has_interrupt(vcpu) && !vmx_interrupt_blocked(vcpu)) {
+ int irq;
+
if (block_nested_events)
return -EBUSY;
if (!nested_exit_on_intr(vcpu))
goto no_vmexit;
- nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
+
+ if (!nested_exit_intr_ack_set(vcpu)) {
+ nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
+ return 0;
+ }
+
+ irq = kvm_cpu_get_extint(vcpu);
+ if (irq != -1) {
+ nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT,
+ INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | irq, 0);
+ return 0;
+ }
+
+ irq = kvm_apic_has_interrupt(vcpu);
+ if (WARN_ON_ONCE(irq < 0))
+ goto no_vmexit;
+
+ /*
+ * If the IRQ is L2's PI notification vector, process posted
+ * interrupts for L2 instead of injecting VM-Exit, as the
+ * detection/morphing architecturally occurs when the IRQ is
+ * delivered to the CPU. Note, only interrupts that are routed
+ * through the local APIC trigger posted interrupt processing,
+ * and enabling posted interrupts requires ACK-on-exit.
+ */
+ if (irq == vmx->nested.posted_intr_nv) {
+ vmx->nested.pi_pending = true;
+ kvm_apic_clear_irr(vcpu, irq);
+ goto no_vmexit;
+ }
+
+ nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT,
+ INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | irq, 0);
+
+ /*
+ * ACK the interrupt _after_ emulating VM-Exit, as the IRQ must
+ * be marked as in-service in vmcs01.GUEST_INTERRUPT_STATUS.SVI
+ * if APICv is active.
+ */
+ kvm_apic_ack_interrupt(vcpu, irq);
return 0;
}
@@ -4640,7 +4770,7 @@ static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx)
return vmcs_read64(GUEST_IA32_EFER);
if (cpu_has_load_ia32_efer())
- return host_efer;
+ return kvm_host.efer;
for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) {
if (vmx->msr_autoload.guest.val[i].index == MSR_EFER)
@@ -4651,7 +4781,7 @@ static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx)
if (efer_msr)
return efer_msr->data;
- return host_efer;
+ return kvm_host.efer;
}
static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
@@ -4744,7 +4874,7 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
goto vmabort;
}
- if (kvm_set_msr(vcpu, h.index, h.value)) {
+ if (kvm_set_msr_with_filter(vcpu, h.index, h.value)) {
pr_debug_ratelimited(
"%s WRMSR failed (%u, 0x%x, 0x%llx)\n",
__func__, j, h.index, h.value);
@@ -4907,14 +5037,6 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
if (likely(!vmx->fail)) {
- if ((u16)vm_exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT &&
- nested_exit_intr_ack_set(vcpu)) {
- int irq = kvm_cpu_get_interrupt(vcpu);
- WARN_ON(irq < 0);
- vmcs12->vm_exit_intr_info = irq |
- INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR;
- }
-
if (vm_exit_reason != -1)
trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
vmcs12->exit_qualification,
@@ -6208,6 +6330,8 @@ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu,
else if (is_alignment_check(intr_info) &&
!vmx_guest_inject_ac(vcpu))
return true;
+ else if (is_ve_fault(intr_info))
+ return true;
return false;
case EXIT_REASON_EXTERNAL_INTERRUPT:
return true;
@@ -6987,7 +7111,7 @@ static void nested_vmx_setup_misc_data(struct vmcs_config *vmcs_conf,
{
msrs->misc_low = (u32)vmcs_conf->misc & VMX_MISC_SAVE_EFER_LMA;
msrs->misc_low |=
- MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
+ VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
VMX_MISC_ACTIVITY_HLT |
VMX_MISC_ACTIVITY_WAIT_SIPI;
@@ -7002,12 +7126,10 @@ static void nested_vmx_setup_basic(struct nested_vmx_msrs *msrs)
* guest, and the VMCS structure we give it - not about the
* VMX support of the underlying hardware.
*/
- msrs->basic =
- VMCS12_REVISION |
- VMX_BASIC_TRUE_CTLS |
- ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
- (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
+ msrs->basic = vmx_basic_encode_vmcs_info(VMCS12_REVISION, VMCS12_SIZE,
+ X86_MEMTYPE_WB);
+ msrs->basic |= VMX_BASIC_TRUE_CTLS;
if (cpu_has_vmx_basic_inout())
msrs->basic |= VMX_BASIC_INOUT;
}
diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h
index cce4e2aa30fb..2c296b6abb8c 100644
--- a/arch/x86/kvm/vmx/nested.h
+++ b/arch/x86/kvm/vmx/nested.h
@@ -39,11 +39,17 @@ bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port,
static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
{
+ lockdep_assert_once(lockdep_is_held(&vcpu->mutex) ||
+ !refcount_read(&vcpu->kvm->users_count));
+
return to_vmx(vcpu)->nested.cached_vmcs12;
}
static inline struct vmcs12 *get_shadow_vmcs12(struct kvm_vcpu *vcpu)
{
+ lockdep_assert_once(lockdep_is_held(&vcpu->mutex) ||
+ !refcount_read(&vcpu->kvm->users_count));
+
return to_vmx(vcpu)->nested.cached_shadow_vmcs12;
}
@@ -109,7 +115,7 @@ static inline unsigned nested_cpu_vmx_misc_cr3_count(struct kvm_vcpu *vcpu)
static inline bool nested_cpu_has_vmwrite_any_field(struct kvm_vcpu *vcpu)
{
return to_vmx(vcpu)->nested.msrs.misc_low &
- MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS;
+ VMX_MISC_VMWRITE_SHADOW_RO_FIELDS;
}
static inline bool nested_cpu_has_zero_length_injection(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index be40474de6e4..83382a4d1d66 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -348,14 +348,14 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
switch (msr) {
case MSR_CORE_PERF_FIXED_CTR_CTRL:
- if (data & pmu->fixed_ctr_ctrl_mask)
+ if (data & pmu->fixed_ctr_ctrl_rsvd)
return 1;
if (pmu->fixed_ctr_ctrl != data)
reprogram_fixed_counters(pmu, data);
break;
case MSR_IA32_PEBS_ENABLE:
- if (data & pmu->pebs_enable_mask)
+ if (data & pmu->pebs_enable_rsvd)
return 1;
if (pmu->pebs_enable != data) {
@@ -371,7 +371,7 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
pmu->ds_area = data;
break;
case MSR_PEBS_DATA_CFG:
- if (data & pmu->pebs_data_cfg_mask)
+ if (data & pmu->pebs_data_cfg_rsvd)
return 1;
pmu->pebs_data_cfg = data;
@@ -436,8 +436,8 @@ static __always_inline u64 intel_get_fixed_pmc_eventsel(unsigned int index)
};
u64 eventsel;
- BUILD_BUG_ON(ARRAY_SIZE(fixed_pmc_perf_ids) != KVM_PMC_MAX_FIXED);
- BUILD_BUG_ON(index >= KVM_PMC_MAX_FIXED);
+ BUILD_BUG_ON(ARRAY_SIZE(fixed_pmc_perf_ids) != KVM_MAX_NR_INTEL_FIXED_COUTNERS);
+ BUILD_BUG_ON(index >= KVM_MAX_NR_INTEL_FIXED_COUTNERS);
/*
* Yell if perf reports support for a fixed counter but perf doesn't
@@ -448,6 +448,14 @@ static __always_inline u64 intel_get_fixed_pmc_eventsel(unsigned int index)
return eventsel;
}
+static void intel_pmu_enable_fixed_counter_bits(struct kvm_pmu *pmu, u64 bits)
+{
+ int i;
+
+ for (i = 0; i < pmu->nr_arch_fixed_counters; i++)
+ pmu->fixed_ctr_ctrl_rsvd &= ~intel_fixed_bits_by_idx(i, bits);
+}
+
static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
@@ -456,8 +464,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
union cpuid10_eax eax;
union cpuid10_edx edx;
u64 perf_capabilities;
- u64 counter_mask;
- int i;
+ u64 counter_rsvd;
memset(&lbr_desc->records, 0, sizeof(lbr_desc->records));
@@ -501,22 +508,24 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
((u64)1 << edx.split.bit_width_fixed) - 1;
}
- for (i = 0; i < pmu->nr_arch_fixed_counters; i++)
- pmu->fixed_ctr_ctrl_mask &= ~(0xbull << (i * 4));
- counter_mask = ~(((1ull << pmu->nr_arch_gp_counters) - 1) |
+ intel_pmu_enable_fixed_counter_bits(pmu, INTEL_FIXED_0_KERNEL |
+ INTEL_FIXED_0_USER |
+ INTEL_FIXED_0_ENABLE_PMI);
+
+ counter_rsvd = ~(((1ull << pmu->nr_arch_gp_counters) - 1) |
(((1ull << pmu->nr_arch_fixed_counters) - 1) << KVM_FIXED_PMC_BASE_IDX));
- pmu->global_ctrl_mask = counter_mask;
+ pmu->global_ctrl_rsvd = counter_rsvd;
/*
* GLOBAL_STATUS and GLOBAL_OVF_CONTROL (a.k.a. GLOBAL_STATUS_RESET)
* share reserved bit definitions. The kernel just happens to use
* OVF_CTRL for the names.
*/
- pmu->global_status_mask = pmu->global_ctrl_mask
+ pmu->global_status_rsvd = pmu->global_ctrl_rsvd
& ~(MSR_CORE_PERF_GLOBAL_OVF_CTRL_OVF_BUF |
MSR_CORE_PERF_GLOBAL_OVF_CTRL_COND_CHGD);
if (vmx_pt_mode_is_host_guest())
- pmu->global_status_mask &=
+ pmu->global_status_rsvd &=
~MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI;
entry = kvm_find_cpuid_entry_index(vcpu, 7, 0);
@@ -544,15 +553,12 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
if (perf_capabilities & PERF_CAP_PEBS_FORMAT) {
if (perf_capabilities & PERF_CAP_PEBS_BASELINE) {
- pmu->pebs_enable_mask = counter_mask;
+ pmu->pebs_enable_rsvd = counter_rsvd;
pmu->reserved_bits &= ~ICL_EVENTSEL_ADAPTIVE;
- for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
- pmu->fixed_ctr_ctrl_mask &=
- ~(1ULL << (KVM_FIXED_PMC_BASE_IDX + i * 4));
- }
- pmu->pebs_data_cfg_mask = ~0xff00000full;
+ pmu->pebs_data_cfg_rsvd = ~0xff00000full;
+ intel_pmu_enable_fixed_counter_bits(pmu, ICL_FIXED_0_ADAPTIVE);
} else {
- pmu->pebs_enable_mask =
+ pmu->pebs_enable_rsvd =
~((1ull << pmu->nr_arch_gp_counters) - 1);
}
}
@@ -564,14 +570,14 @@ static void intel_pmu_init(struct kvm_vcpu *vcpu)
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
- for (i = 0; i < KVM_INTEL_PMC_MAX_GENERIC; i++) {
+ for (i = 0; i < KVM_MAX_NR_INTEL_GP_COUNTERS; i++) {
pmu->gp_counters[i].type = KVM_PMC_GP;
pmu->gp_counters[i].vcpu = vcpu;
pmu->gp_counters[i].idx = i;
pmu->gp_counters[i].current_config = 0;
}
- for (i = 0; i < KVM_PMC_MAX_FIXED; i++) {
+ for (i = 0; i < KVM_MAX_NR_INTEL_FIXED_COUTNERS; i++) {
pmu->fixed_counters[i].type = KVM_PMC_FIXED;
pmu->fixed_counters[i].vcpu = vcpu;
pmu->fixed_counters[i].idx = i + KVM_FIXED_PMC_BASE_IDX;
@@ -731,6 +737,6 @@ struct kvm_pmu_ops intel_pmu_ops __initdata = {
.deliver_pmi = intel_pmu_deliver_pmi,
.cleanup = intel_pmu_cleanup,
.EVENTSEL_EVENT = ARCH_PERFMON_EVENTSEL_EVENT,
- .MAX_NR_GP_COUNTERS = KVM_INTEL_PMC_MAX_GENERIC,
+ .MAX_NR_GP_COUNTERS = KVM_MAX_NR_INTEL_GP_COUNTERS,
.MIN_NR_GP_COUNTERS = 1,
};
diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c
index af662312fd07..ec08fa3caf43 100644
--- a/arch/x86/kvm/vmx/posted_intr.c
+++ b/arch/x86/kvm/vmx/posted_intr.c
@@ -107,7 +107,7 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
* handle task migration (@cpu != vcpu->cpu).
*/
new.ndst = dest;
- new.sn = 0;
+ __pi_clear_sn(&new);
/*
* Restore the notification vector; in the blocking case, the
@@ -157,7 +157,7 @@ static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu)
&per_cpu(wakeup_vcpus_on_cpu, vcpu->cpu));
raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
- WARN(pi_desc->sn, "PI descriptor SN field set before blocking");
+ WARN(pi_test_sn(pi_desc), "PI descriptor SN field set before blocking");
old.control = READ_ONCE(pi_desc->control);
do {
diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h
index 26992076552e..1715d2ab07be 100644
--- a/arch/x86/kvm/vmx/posted_intr.h
+++ b/arch/x86/kvm/vmx/posted_intr.h
@@ -2,97 +2,8 @@
#ifndef __KVM_X86_VMX_POSTED_INTR_H
#define __KVM_X86_VMX_POSTED_INTR_H
-#define POSTED_INTR_ON 0
-#define POSTED_INTR_SN 1
-
-#define PID_TABLE_ENTRY_VALID 1
-
-/* Posted-Interrupt Descriptor */
-struct pi_desc {
- u32 pir[8]; /* Posted interrupt requested */
- union {
- struct {
- /* bit 256 - Outstanding Notification */
- u16 on : 1,
- /* bit 257 - Suppress Notification */
- sn : 1,
- /* bit 271:258 - Reserved */
- rsvd_1 : 14;
- /* bit 279:272 - Notification Vector */
- u8 nv;
- /* bit 287:280 - Reserved */
- u8 rsvd_2;
- /* bit 319:288 - Notification Destination */
- u32 ndst;
- };
- u64 control;
- };
- u32 rsvd[6];
-} __aligned(64);
-
-static inline bool pi_test_and_set_on(struct pi_desc *pi_desc)
-{
- return test_and_set_bit(POSTED_INTR_ON,
- (unsigned long *)&pi_desc->control);
-}
-
-static inline bool pi_test_and_clear_on(struct pi_desc *pi_desc)
-{
- return test_and_clear_bit(POSTED_INTR_ON,
- (unsigned long *)&pi_desc->control);
-}
-
-static inline bool pi_test_and_clear_sn(struct pi_desc *pi_desc)
-{
- return test_and_clear_bit(POSTED_INTR_SN,
- (unsigned long *)&pi_desc->control);
-}
-
-static inline bool pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
-{
- return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
-}
-
-static inline bool pi_is_pir_empty(struct pi_desc *pi_desc)
-{
- return bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS);
-}
-
-static inline void pi_set_sn(struct pi_desc *pi_desc)
-{
- set_bit(POSTED_INTR_SN,
- (unsigned long *)&pi_desc->control);
-}
-
-static inline void pi_set_on(struct pi_desc *pi_desc)
-{
- set_bit(POSTED_INTR_ON,
- (unsigned long *)&pi_desc->control);
-}
-
-static inline void pi_clear_on(struct pi_desc *pi_desc)
-{
- clear_bit(POSTED_INTR_ON,
- (unsigned long *)&pi_desc->control);
-}
-
-static inline void pi_clear_sn(struct pi_desc *pi_desc)
-{
- clear_bit(POSTED_INTR_SN,
- (unsigned long *)&pi_desc->control);
-}
-
-static inline bool pi_test_on(struct pi_desc *pi_desc)
-{
- return test_bit(POSTED_INTR_ON,
- (unsigned long *)&pi_desc->control);
-}
-
-static inline bool pi_test_sn(struct pi_desc *pi_desc)
-{
- return test_bit(POSTED_INTR_SN,
- (unsigned long *)&pi_desc->control);
-}
+#include <linux/find.h>
+#include <asm/posted_intr.h>
void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu);
void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu);
@@ -103,4 +14,12 @@ int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
uint32_t guest_irq, bool set);
void vmx_pi_start_assignment(struct kvm *kvm);
+static inline int pi_find_highest_vector(struct pi_desc *pi_desc)
+{
+ int vec;
+
+ vec = find_last_bit((unsigned long *)pi_desc->pir, 256);
+ return vec < 256 ? vec : -1;
+}
+
#endif /* __KVM_X86_VMX_POSTED_INTR_H */
diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c
index 6fef01e0536e..a3c3d2a51f47 100644
--- a/arch/x86/kvm/vmx/sgx.c
+++ b/arch/x86/kvm/vmx/sgx.c
@@ -274,7 +274,7 @@ static int handle_encls_ecreate(struct kvm_vcpu *vcpu)
* simultaneously set SGX_ATTR_PROVISIONKEY to bypass the check to
* enforce restriction of access to the PROVISIONKEY.
*/
- contents = (struct sgx_secs *)__get_free_page(GFP_KERNEL_ACCOUNT);
+ contents = (struct sgx_secs *)__get_free_page(GFP_KERNEL);
if (!contents)
return -ENOMEM;
diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h
index 7c1996b433e2..b25625314658 100644
--- a/arch/x86/kvm/vmx/vmcs.h
+++ b/arch/x86/kvm/vmx/vmcs.h
@@ -140,6 +140,11 @@ static inline bool is_nm_fault(u32 intr_info)
return is_exception_n(intr_info, NM_VECTOR);
}
+static inline bool is_ve_fault(u32 intr_info)
+{
+ return is_exception_n(intr_info, VE_VECTOR);
+}
+
/* Undocumented: icebp/int1 */
static inline bool is_icebp(u32 intr_info)
{
diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h
index 01936013428b..56fd150a6f24 100644
--- a/arch/x86/kvm/vmx/vmcs12.h
+++ b/arch/x86/kvm/vmx/vmcs12.h
@@ -188,12 +188,13 @@ struct __packed vmcs12 {
};
/*
- * VMCS12_REVISION is an arbitrary id that should be changed if the content or
- * layout of struct vmcs12 is changed. MSR_IA32_VMX_BASIC returns this id, and
- * VMPTRLD verifies that the VMCS region that L1 is loading contains this id.
+ * VMCS12_REVISION is KVM's arbitrary ID for the layout of struct vmcs12. KVM
+ * enumerates this value to L1 via MSR_IA32_VMX_BASIC, and checks the revision
+ * ID during nested VMPTRLD to verify that L1 is loading a VMCS that adhere's
+ * to KVM's virtual CPU definition.
*
- * IMPORTANT: Changing this value will break save/restore compatibility with
- * older kvm releases.
+ * DO NOT change this value, as it will break save/restore compatibility with
+ * older KVM releases.
*/
#define VMCS12_REVISION 0x11e57ed0
@@ -206,7 +207,8 @@ struct __packed vmcs12 {
#define VMCS12_SIZE KVM_STATE_NESTED_VMX_VMCS_SIZE
/*
- * For save/restore compatibility, the vmcs12 field offsets must not change.
+ * For save/restore compatibility, the vmcs12 field offsets must not change,
+ * although appending fields and/or filling gaps is obviously allowed.
*/
#define CHECK_OFFSET(field, loc) \
ASSERT_STRUCT_OFFSET(struct vmcs12, field, loc)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 22411f4aff53..1a4438358c5e 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -68,10 +68,13 @@
#include "vmcs12.h"
#include "vmx.h"
#include "x86.h"
+#include "x86_ops.h"
#include "smm.h"
#include "vmx_onhyperv.h"
+#include "posted_intr.h"
MODULE_AUTHOR("Qumranet");
+MODULE_DESCRIPTION("KVM support for VMX (Intel VT-x) extensions");
MODULE_LICENSE("GPL");
#ifdef MODULE
@@ -257,7 +260,7 @@ static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
return 0;
}
- if (host_arch_capabilities & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
+ if (kvm_host.arch_capabilities & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
return 0;
}
@@ -402,7 +405,7 @@ static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx)
* and VM-Exit.
*/
vmx->disable_fb_clear = !cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF) &&
- (host_arch_capabilities & ARCH_CAP_FB_CLEAR_CTRL) &&
+ (kvm_host.arch_capabilities & ARCH_CAP_FB_CLEAR_CTRL) &&
!boot_cpu_has_bug(X86_BUG_MDS) &&
!boot_cpu_has_bug(X86_BUG_TAA);
@@ -522,16 +525,10 @@ static const struct kvm_vmx_segment_field {
VMX_SEGMENT_FIELD(LDTR),
};
-static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
-{
- vmx->segment_cache.bitmask = 0;
-}
static unsigned long host_idt_base;
#if IS_ENABLED(CONFIG_HYPERV)
-static struct kvm_x86_ops vmx_x86_ops __initdata;
-
static bool __read_mostly enlightened_vmcs = true;
module_param(enlightened_vmcs, bool, 0444);
@@ -581,9 +578,8 @@ static __init void hv_init_evmcs(void)
}
if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH)
- vmx_x86_ops.enable_l2_tlb_flush
+ vt_x86_ops.enable_l2_tlb_flush
= hv_enable_l2_tlb_flush;
-
} else {
enlightened_vmcs = false;
}
@@ -755,7 +751,7 @@ fault:
return -EIO;
}
-static void vmx_emergency_disable(void)
+void vmx_emergency_disable_virtualization_cpu(void)
{
int cpu = raw_smp_processor_id();
struct loaded_vmcs *v;
@@ -874,6 +870,12 @@ void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu)
eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
(1u << DB_VECTOR) | (1u << AC_VECTOR);
/*
+ * #VE isn't used for VMX. To test against unexpected changes
+ * related to #VE for VMX, intercept unexpected #VE and warn on it.
+ */
+ if (IS_ENABLED(CONFIG_KVM_INTEL_PROVE_VE))
+ eb |= 1u << VE_VECTOR;
+ /*
* Guest access to VMware backdoor ports could legitimately
* trigger #GP because of TSS I/O permission bitmap.
* We intercept those #GP and allow access to them anyway
@@ -1118,12 +1120,12 @@ static bool update_transition_efer(struct vcpu_vmx *vmx)
* atomically, since it's faster than switching it manually.
*/
if (cpu_has_load_ia32_efer() ||
- (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) {
+ (enable_ept && ((vmx->vcpu.arch.efer ^ kvm_host.efer) & EFER_NX))) {
if (!(guest_efer & EFER_LMA))
guest_efer &= ~EFER_LME;
- if (guest_efer != host_efer)
+ if (guest_efer != kvm_host.efer)
add_atomic_switch_msr(vmx, MSR_EFER,
- guest_efer, host_efer, false);
+ guest_efer, kvm_host.efer, false);
else
clear_atomic_switch_msr(vmx, MSR_EFER);
return false;
@@ -1136,7 +1138,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx)
clear_atomic_switch_msr(vmx, MSR_EFER);
guest_efer &= ~ignore_bits;
- guest_efer |= host_efer & ignore_bits;
+ guest_efer |= kvm_host.efer & ignore_bits;
vmx->guest_uret_msrs[i].data = guest_efer;
vmx->guest_uret_msrs[i].mask = ~ignore_bits;
@@ -1406,6 +1408,38 @@ static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
}
#endif
+static void grow_ple_window(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned int old = vmx->ple_window;
+
+ vmx->ple_window = __grow_ple_window(old, ple_window,
+ ple_window_grow,
+ ple_window_max);
+
+ if (vmx->ple_window != old) {
+ vmx->ple_window_dirty = true;
+ trace_kvm_ple_window_update(vcpu->vcpu_id,
+ vmx->ple_window, old);
+ }
+}
+
+static void shrink_ple_window(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned int old = vmx->ple_window;
+
+ vmx->ple_window = __shrink_ple_window(old, ple_window,
+ ple_window_shrink,
+ ple_window);
+
+ if (vmx->ple_window != old) {
+ vmx->ple_window_dirty = true;
+ trace_kvm_ple_window_update(vcpu->vcpu_id,
+ vmx->ple_window, old);
+ }
+}
+
void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
struct loaded_vmcs *buddy)
{
@@ -1477,10 +1511,13 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
* Switches to specified vcpu, until a matching vcpu_put(), but assumes
* vcpu mutex is already taken.
*/
-static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm))
+ shrink_ple_window(vcpu);
+
vmx_vcpu_load_vmcs(vcpu, cpu, NULL);
vmx_vcpu_pi_load(vcpu, cpu);
@@ -1488,7 +1525,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
vmx->host_debugctlmsr = get_debugctlmsr();
}
-static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
+void vmx_vcpu_put(struct kvm_vcpu *vcpu)
{
vmx_vcpu_pi_put(vcpu);
@@ -1547,7 +1584,7 @@ void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
vmx->emulation_required = vmx_emulation_required(vcpu);
}
-static bool vmx_get_if_flag(struct kvm_vcpu *vcpu)
+bool vmx_get_if_flag(struct kvm_vcpu *vcpu)
{
return vmx_get_rflags(vcpu) & X86_EFLAGS_IF;
}
@@ -1653,8 +1690,8 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
return 0;
}
-static int vmx_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
- void *insn, int insn_len)
+int vmx_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
+ void *insn, int insn_len)
{
/*
* Emulation of instructions in SGX enclaves is impossible as RIP does
@@ -1738,7 +1775,7 @@ rip_updated:
* Recognizes a pending MTF VM-exit and records the nested state for later
* delivery.
*/
-static void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu)
+void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -1769,7 +1806,7 @@ static void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu)
}
}
-static int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu)
+int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
vmx_update_emulated_instruction(vcpu);
return skip_emulated_instruction(vcpu);
@@ -1788,7 +1825,7 @@ static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
}
-static void vmx_inject_exception(struct kvm_vcpu *vcpu)
+void vmx_inject_exception(struct kvm_vcpu *vcpu)
{
struct kvm_queued_exception *ex = &vcpu->arch.exception;
u32 intr_info = ex->vector | INTR_INFO_VALID_MASK;
@@ -1909,12 +1946,12 @@ u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
return kvm_caps.default_tsc_scaling_ratio;
}
-static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu)
+void vmx_write_tsc_offset(struct kvm_vcpu *vcpu)
{
vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
}
-static void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu)
+void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu)
{
vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);
}
@@ -1957,15 +1994,15 @@ static inline bool is_vmx_feature_control_msr_valid(struct vcpu_vmx *vmx,
return !(msr->data & ~valid_bits);
}
-static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
+int vmx_get_feature_msr(u32 msr, u64 *data)
{
- switch (msr->index) {
+ switch (msr) {
case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
if (!nested)
return 1;
- return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data);
+ return vmx_get_vmx_msr(&vmcs_config.nested, msr, data);
default:
- return KVM_MSR_RET_INVALID;
+ return KVM_MSR_RET_UNSUPPORTED;
}
}
@@ -1974,7 +2011,7 @@ static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
* Returns 0 on success, non-0 otherwise.
* Assumes vcpu_load() was already called.
*/
-static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct vmx_uret_msr *msr;
@@ -2155,7 +2192,7 @@ static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated
* Returns 0 on success, non-0 otherwise.
* Assumes vcpu_load() was already called.
*/
-static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct vmx_uret_msr *msr;
@@ -2458,7 +2495,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return ret;
}
-static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
+void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
{
unsigned long guest_owned_bits;
@@ -2520,17 +2557,15 @@ static bool cpu_has_sgx(void)
*/
static bool cpu_has_perf_global_ctrl_bug(void)
{
- if (boot_cpu_data.x86 == 0x6) {
- switch (boot_cpu_data.x86_model) {
- case INTEL_FAM6_NEHALEM_EP: /* AAK155 */
- case INTEL_FAM6_NEHALEM: /* AAP115 */
- case INTEL_FAM6_WESTMERE: /* AAT100 */
- case INTEL_FAM6_WESTMERE_EP: /* BC86,AAY89,BD102 */
- case INTEL_FAM6_NEHALEM_EX: /* BA97 */
- return true;
- default:
- break;
- }
+ switch (boot_cpu_data.x86_vfm) {
+ case INTEL_NEHALEM_EP: /* AAK155 */
+ case INTEL_NEHALEM: /* AAP115 */
+ case INTEL_WESTMERE: /* AAT100 */
+ case INTEL_WESTMERE_EP: /* BC86,AAY89,BD102 */
+ case INTEL_NEHALEM_EX: /* BA97 */
+ return true;
+ default:
+ break;
}
return false;
@@ -2566,13 +2601,13 @@ static u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr)
static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
struct vmx_capability *vmx_cap)
{
- u32 vmx_msr_low, vmx_msr_high;
u32 _pin_based_exec_control = 0;
u32 _cpu_based_exec_control = 0;
u32 _cpu_based_2nd_exec_control = 0;
u64 _cpu_based_3rd_exec_control = 0;
u32 _vmexit_control = 0;
u32 _vmentry_control = 0;
+ u64 basic_msr;
u64 misc_msr;
int i;
@@ -2606,6 +2641,9 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
&_cpu_based_2nd_exec_control))
return -EIO;
}
+ if (!IS_ENABLED(CONFIG_KVM_INTEL_PROVE_VE))
+ _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
+
#ifndef CONFIG_X86_64
if (!(_cpu_based_2nd_exec_control &
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
@@ -2630,6 +2668,7 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
return -EIO;
vmx_cap->ept = 0;
+ _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
}
if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
vmx_cap->vpid) {
@@ -2691,29 +2730,29 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
_vmexit_control &= ~x_ctrl;
}
- rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
+ rdmsrl(MSR_IA32_VMX_BASIC, basic_msr);
/* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
- if ((vmx_msr_high & 0x1fff) > PAGE_SIZE)
+ if (vmx_basic_vmcs_size(basic_msr) > PAGE_SIZE)
return -EIO;
#ifdef CONFIG_X86_64
- /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
- if (vmx_msr_high & (1u<<16))
+ /*
+ * KVM expects to be able to shove all legal physical addresses into
+ * VMCS fields for 64-bit kernels, and per the SDM, "This bit is always
+ * 0 for processors that support Intel 64 architecture".
+ */
+ if (basic_msr & VMX_BASIC_32BIT_PHYS_ADDR_ONLY)
return -EIO;
#endif
/* Require Write-Back (WB) memory type for VMCS accesses. */
- if (((vmx_msr_high >> 18) & 15) != 6)
+ if (vmx_basic_vmcs_mem_type(basic_msr) != X86_MEMTYPE_WB)
return -EIO;
rdmsrl(MSR_IA32_VMX_MISC, misc_msr);
- vmcs_conf->size = vmx_msr_high & 0x1fff;
- vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;
-
- vmcs_conf->revision_id = vmx_msr_low;
-
+ vmcs_conf->basic = basic_msr;
vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
@@ -2759,7 +2798,7 @@ static bool kvm_is_vmx_supported(void)
return supported;
}
-static int vmx_check_processor_compat(void)
+int vmx_check_processor_compat(void)
{
int cpu = raw_smp_processor_id();
struct vmcs_config vmcs_conf;
@@ -2801,7 +2840,7 @@ fault:
return -EFAULT;
}
-static int vmx_hardware_enable(void)
+int vmx_enable_virtualization_cpu(void)
{
int cpu = raw_smp_processor_id();
u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
@@ -2825,9 +2864,6 @@ static int vmx_hardware_enable(void)
return r;
}
- if (enable_ept)
- ept_sync_global();
-
return 0;
}
@@ -2841,7 +2877,7 @@ static void vmclear_local_loaded_vmcss(void)
__loaded_vmcs_clear(v);
}
-static void vmx_hardware_disable(void)
+void vmx_disable_virtualization_cpu(void)
{
vmclear_local_loaded_vmcss();
@@ -2863,13 +2899,13 @@ struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags)
if (!pages)
return NULL;
vmcs = page_address(pages);
- memset(vmcs, 0, vmcs_config.size);
+ memset(vmcs, 0, vmx_basic_vmcs_size(vmcs_config.basic));
/* KVM supports Enlightened VMCS v1 only */
if (kvm_is_using_evmcs())
vmcs->hdr.revision_id = KVM_EVMCS_VERSION;
else
- vmcs->hdr.revision_id = vmcs_config.revision_id;
+ vmcs->hdr.revision_id = vmx_basic_vmcs_revision_id(vmcs_config.basic);
if (shadow)
vmcs->hdr.shadow_vmcs = 1;
@@ -2962,7 +2998,7 @@ static __init int alloc_kvm_area(void)
* physical CPU.
*/
if (kvm_is_using_evmcs())
- vmcs->hdr.revision_id = vmcs_config.revision_id;
+ vmcs->hdr.revision_id = vmx_basic_vmcs_revision_id(vmcs_config.basic);
per_cpu(vmxarea, cpu) = vmcs;
}
@@ -3155,7 +3191,7 @@ static void exit_lmode(struct kvm_vcpu *vcpu)
#endif
-static void vmx_flush_tlb_all(struct kvm_vcpu *vcpu)
+void vmx_flush_tlb_all(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -3185,7 +3221,7 @@ static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu)
return to_vmx(vcpu)->vpid;
}
-static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
+void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *mmu = vcpu->arch.mmu;
u64 root_hpa = mmu->root.hpa;
@@ -3201,7 +3237,7 @@ static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
vpid_sync_context(vmx_get_current_vpid(vcpu));
}
-static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
+void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
{
/*
* vpid_sync_vcpu_addr() is a nop if vpid==0, see the comment in
@@ -3210,7 +3246,7 @@ static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
vpid_sync_vcpu_addr(vmx_get_current_vpid(vcpu), addr);
}
-static void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu)
+void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu)
{
/*
* vpid_sync_context() is a nop if vpid==0, e.g. if enable_vpid==0 or a
@@ -3255,7 +3291,7 @@ void ept_save_pdptrs(struct kvm_vcpu *vcpu)
#define CR3_EXITING_BITS (CPU_BASED_CR3_LOAD_EXITING | \
CPU_BASED_CR3_STORE_EXITING)
-static bool vmx_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+bool vmx_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
if (is_guest_mode(vcpu))
return nested_guest_cr0_valid(vcpu, cr0);
@@ -3376,8 +3412,7 @@ u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level)
return eptp;
}
-static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
- int root_level)
+void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level)
{
struct kvm *kvm = vcpu->kvm;
bool update_guest_cr3 = true;
@@ -3406,8 +3441,7 @@ static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
vmcs_writel(GUEST_CR3, guest_cr3);
}
-
-static bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
/*
* We operate under the default treatment of SMM, so VMX cannot be
@@ -3523,7 +3557,7 @@ void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
var->g = (ar >> 15) & 1;
}
-static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
+u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
{
struct kvm_segment s;
@@ -3600,14 +3634,14 @@ void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var));
}
-static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
+void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
{
__vmx_set_segment(vcpu, var, seg);
to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu);
}
-static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
+void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
{
u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS);
@@ -3615,25 +3649,25 @@ static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
*l = (ar >> 13) & 1;
}
-static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
dt->size = vmcs_read32(GUEST_IDTR_LIMIT);
dt->address = vmcs_readl(GUEST_IDTR_BASE);
}
-static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
vmcs_write32(GUEST_IDTR_LIMIT, dt->size);
vmcs_writel(GUEST_IDTR_BASE, dt->address);
}
-static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
dt->size = vmcs_read32(GUEST_GDTR_LIMIT);
dt->address = vmcs_readl(GUEST_GDTR_BASE);
}
-static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
vmcs_write32(GUEST_GDTR_LIMIT, dt->size);
vmcs_writel(GUEST_GDTR_BASE, dt->address);
@@ -4101,27 +4135,7 @@ void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu)
}
}
-static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- void *vapic_page;
- u32 vppr;
- int rvi;
-
- if (WARN_ON_ONCE(!is_guest_mode(vcpu)) ||
- !nested_cpu_has_vid(get_vmcs12(vcpu)) ||
- WARN_ON_ONCE(!vmx->nested.virtual_apic_map.gfn))
- return false;
-
- rvi = vmx_get_rvi();
-
- vapic_page = vmx->nested.virtual_apic_map.hva;
- vppr = *((u32 *)(vapic_page + APIC_PROCPRI));
-
- return ((rvi & 0xf0) > (vppr & 0xf0));
-}
-
-static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu)
+void vmx_msr_filter_changed(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 i;
@@ -4201,6 +4215,13 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ /*
+ * DO NOT query the vCPU's vmcs12, as vmcs12 is dynamically allocated
+ * and freed, and must not be accessed outside of vcpu->mutex. The
+ * vCPU's cached PI NV is valid if and only if posted interrupts
+ * enabled in its vmcs12, i.e. checking the vector also checks that
+ * L1 has enabled posted interrupts for L2.
+ */
if (is_guest_mode(vcpu) &&
vector == vmx->nested.posted_intr_nv) {
/*
@@ -4265,8 +4286,8 @@ static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
return 0;
}
-static void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
- int trig_mode, int vector)
+void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
+ int trig_mode, int vector)
{
struct kvm_vcpu *vcpu = apic->vcpu;
@@ -4350,7 +4371,7 @@ void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
}
if (cpu_has_load_ia32_efer())
- vmcs_write64(HOST_IA32_EFER, host_efer);
+ vmcs_write64(HOST_IA32_EFER, kvm_host.efer);
}
void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
@@ -4428,7 +4449,7 @@ static u32 vmx_vmexit_ctrl(void)
~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER);
}
-static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
+void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -4594,6 +4615,7 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
if (!enable_ept) {
exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
+ exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
enable_unrestricted_guest = 0;
}
if (!enable_unrestricted_guest)
@@ -4692,7 +4714,7 @@ static int vmx_alloc_ipiv_pid_table(struct kvm *kvm)
return 0;
}
-static int vmx_vcpu_precreate(struct kvm *kvm)
+int vmx_vcpu_precreate(struct kvm *kvm)
{
return vmx_alloc_ipiv_pid_table(kvm);
}
@@ -4717,8 +4739,12 @@ static void init_vmcs(struct vcpu_vmx *vmx)
exec_controls_set(vmx, vmx_exec_control(vmx));
- if (cpu_has_secondary_exec_ctrls())
+ if (cpu_has_secondary_exec_ctrls()) {
secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx));
+ if (vmx->ve_info)
+ vmcs_write64(VE_INFORMATION_ADDRESS,
+ __pa(vmx->ve_info));
+ }
if (cpu_has_tertiary_exec_ctrls())
tertiary_exec_controls_set(vmx, vmx_tertiary_exec_control(vmx));
@@ -4844,10 +4870,10 @@ static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu)
* or POSTED_INTR_WAKEUP_VECTOR.
*/
vmx->pi_desc.nv = POSTED_INTR_VECTOR;
- vmx->pi_desc.sn = 1;
+ __pi_set_sn(&vmx->pi_desc);
}
-static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
+void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -4906,12 +4932,12 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
vmx_update_fb_clear_dis(vcpu, vmx);
}
-static void vmx_enable_irq_window(struct kvm_vcpu *vcpu)
+void vmx_enable_irq_window(struct kvm_vcpu *vcpu)
{
exec_controls_setbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);
}
-static void vmx_enable_nmi_window(struct kvm_vcpu *vcpu)
+void vmx_enable_nmi_window(struct kvm_vcpu *vcpu)
{
if (!enable_vnmi ||
vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
@@ -4922,7 +4948,7 @@ static void vmx_enable_nmi_window(struct kvm_vcpu *vcpu)
exec_controls_setbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
}
-static void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
+void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
uint32_t intr;
@@ -4950,7 +4976,7 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
vmx_clear_hlt(vcpu);
}
-static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
+void vmx_inject_nmi(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -5028,7 +5054,7 @@ bool vmx_nmi_blocked(struct kvm_vcpu *vcpu)
GUEST_INTR_STATE_NMI));
}
-static int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
+int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
{
if (to_vmx(vcpu)->nested.nested_run_pending)
return -EBUSY;
@@ -5040,17 +5066,22 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
return !vmx_nmi_blocked(vcpu);
}
+bool __vmx_interrupt_blocked(struct kvm_vcpu *vcpu)
+{
+ return !(vmx_get_rflags(vcpu) & X86_EFLAGS_IF) ||
+ (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
+ (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
+}
+
bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu)
{
if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
return false;
- return !(vmx_get_rflags(vcpu) & X86_EFLAGS_IF) ||
- (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
- (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
+ return __vmx_interrupt_blocked(vcpu);
}
-static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
+int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
{
if (to_vmx(vcpu)->nested.nested_run_pending)
return -EBUSY;
@@ -5065,7 +5096,7 @@ static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
return !vmx_interrupt_blocked(vcpu);
}
-static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
+int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
{
void __user *ret;
@@ -5085,7 +5116,7 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
return init_rmode_tss(kvm, ret);
}
-static int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
+int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
{
to_kvm_vmx(kvm)->ept_identity_map_addr = ident_addr;
return 0;
@@ -5206,6 +5237,16 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
if (is_invalid_opcode(intr_info))
return handle_ud(vcpu);
+ if (WARN_ON_ONCE(is_ve_fault(intr_info))) {
+ struct vmx_ve_information *ve_info = vmx->ve_info;
+
+ WARN_ONCE(ve_info->exit_reason != EXIT_REASON_EPT_VIOLATION,
+ "Unexpected #VE on VM-Exit reason 0x%x", ve_info->exit_reason);
+ dump_vmcs(vcpu);
+ kvm_mmu_print_sptes(vcpu, ve_info->guest_physical_address, "#VE");
+ return 1;
+ }
+
error_code = 0;
if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
@@ -5371,8 +5412,7 @@ static int handle_io(struct kvm_vcpu *vcpu)
return kvm_fast_pio(vcpu, size, port, in);
}
-static void
-vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
+void vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
{
/*
* Patch in the VMCALL instruction:
@@ -5578,7 +5618,7 @@ out:
return kvm_complete_insn_gp(vcpu, err);
}
-static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
+void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
{
get_debugreg(vcpu->arch.db[0], 0);
get_debugreg(vcpu->arch.db[1], 1);
@@ -5597,7 +5637,7 @@ static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
set_debugreg(DR6_RESERVED, 6);
}
-static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
+void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
{
vmcs_writel(GUEST_DR7, val);
}
@@ -5767,10 +5807,9 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
error_code |= (exit_qualification & EPT_VIOLATION_RWX_MASK)
? PFERR_PRESENT_MASK : 0;
- error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) != 0 ?
- PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
-
- vcpu->arch.exit_qualification = exit_qualification;
+ if (error_code & EPT_VIOLATION_GVA_IS_VALID)
+ error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) ?
+ PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
/*
* Check that the GPA doesn't exceed physical memory limits, as that is
@@ -5868,7 +5907,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
return 1;
}
-static int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu)
+int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu)
{
if (vmx_emulation_required_with_pending_exception(vcpu)) {
kvm_prepare_emulation_failure_exit(vcpu);
@@ -5878,38 +5917,6 @@ static int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu)
return 1;
}
-static void grow_ple_window(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- unsigned int old = vmx->ple_window;
-
- vmx->ple_window = __grow_ple_window(old, ple_window,
- ple_window_grow,
- ple_window_max);
-
- if (vmx->ple_window != old) {
- vmx->ple_window_dirty = true;
- trace_kvm_ple_window_update(vcpu->vcpu_id,
- vmx->ple_window, old);
- }
-}
-
-static void shrink_ple_window(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- unsigned int old = vmx->ple_window;
-
- vmx->ple_window = __shrink_ple_window(old, ple_window,
- ple_window_shrink,
- ple_window);
-
- if (vmx->ple_window != old) {
- vmx->ple_window_dirty = true;
- trace_kvm_ple_window_update(vcpu->vcpu_id,
- vmx->ple_window, old);
- }
-}
-
/*
* Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE
* exiting, so only get here on cpu with PAUSE-Loop-Exiting.
@@ -6156,9 +6163,8 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
static const int kvm_vmx_max_exit_handlers =
ARRAY_SIZE(kvm_vmx_exit_handlers);
-static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
- u64 *info1, u64 *info2,
- u32 *intr_info, u32 *error_code)
+void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
+ u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -6416,6 +6422,24 @@ void dump_vmcs(struct kvm_vcpu *vcpu)
if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID)
pr_err("Virtual processor ID = 0x%04x\n",
vmcs_read16(VIRTUAL_PROCESSOR_ID));
+ if (secondary_exec_control & SECONDARY_EXEC_EPT_VIOLATION_VE) {
+ struct vmx_ve_information *ve_info = vmx->ve_info;
+ u64 ve_info_pa = vmcs_read64(VE_INFORMATION_ADDRESS);
+
+ /*
+ * If KVM is dumping the VMCS, then something has gone wrong
+ * already. Derefencing an address from the VMCS, which could
+ * very well be corrupted, is a terrible idea. The virtual
+ * address is known so use it.
+ */
+ pr_err("VE info address = 0x%016llx%s\n", ve_info_pa,
+ ve_info_pa == __pa(ve_info) ? "" : "(corrupted!)");
+ pr_err("ve_info: 0x%08x 0x%08x 0x%016llx 0x%016llx 0x%016llx 0x%04x\n",
+ ve_info->exit_reason, ve_info->delivery,
+ ve_info->exit_qualification,
+ ve_info->guest_linear_address,
+ ve_info->guest_physical_address, ve_info->eptp_index);
+ }
}
/*
@@ -6601,7 +6625,7 @@ unexpected_vmexit:
return 0;
}
-static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
+int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
{
int ret = __vmx_handle_exit(vcpu, exit_fastpath);
@@ -6641,9 +6665,10 @@ static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu)
bool flush_l1d;
/*
- * Clear the per-vcpu flush bit, it gets set again
- * either from vcpu_run() or from one of the unsafe
- * VMEXIT handlers.
+ * Clear the per-vcpu flush bit, it gets set again if the vCPU
+ * is reloaded, i.e. if the vCPU is scheduled out or if KVM
+ * exits to userspace, or if KVM reaches one of the unsafe
+ * VMEXIT handlers, e.g. if KVM calls into the emulator.
*/
flush_l1d = vcpu->arch.l1tf_flush_l1d;
vcpu->arch.l1tf_flush_l1d = false;
@@ -6689,7 +6714,7 @@ static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu)
: "eax", "ebx", "ecx", "edx");
}
-static void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
+void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
int tpr_threshold;
@@ -6759,7 +6784,7 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
vmx_update_msr_bitmap_x2apic(vcpu);
}
-static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
+void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
{
const gfn_t gfn = APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT;
struct kvm *kvm = vcpu->kvm;
@@ -6828,7 +6853,7 @@ out:
kvm_release_pfn_clean(pfn);
}
-static void vmx_hwapic_isr_update(int max_isr)
+void vmx_hwapic_isr_update(int max_isr)
{
u16 status;
u8 old;
@@ -6862,7 +6887,7 @@ static void vmx_set_rvi(int vector)
}
}
-static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
+void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
{
/*
* When running L2, updating RVI is only relevant when
@@ -6876,7 +6901,7 @@ static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
vmx_set_rvi(max_irr);
}
-static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
+int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
int max_irr;
@@ -6922,7 +6947,7 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
return max_irr;
}
-static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
+void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
{
if (!kvm_vcpu_apicv_active(vcpu))
return;
@@ -6933,7 +6958,7 @@ static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]);
}
-static void vmx_apicv_pre_state_restore(struct kvm_vcpu *vcpu)
+void vmx_apicv_pre_state_restore(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -6964,24 +6989,22 @@ static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu)
rdmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);
}
-static void handle_exception_irqoff(struct vcpu_vmx *vmx)
+static void handle_exception_irqoff(struct kvm_vcpu *vcpu, u32 intr_info)
{
- u32 intr_info = vmx_get_intr_info(&vmx->vcpu);
-
/* if exit due to PF check for async PF */
if (is_page_fault(intr_info))
- vmx->vcpu.arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags();
+ vcpu->arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags();
/* if exit due to NM, handle before interrupts are enabled */
else if (is_nm_fault(intr_info))
- handle_nm_fault_irqoff(&vmx->vcpu);
+ handle_nm_fault_irqoff(vcpu);
/* Handle machine checks before interrupts are enabled */
else if (is_machine_check(intr_info))
kvm_machine_check();
}
-static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
+static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu,
+ u32 intr_info)
{
- u32 intr_info = vmx_get_intr_info(vcpu);
unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;
if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm,
@@ -6998,7 +7021,7 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
vcpu->arch.at_instruction_boundary = true;
}
-static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
+void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -7006,16 +7029,16 @@ static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
return;
if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT)
- handle_external_interrupt_irqoff(vcpu);
+ handle_external_interrupt_irqoff(vcpu, vmx_get_intr_info(vcpu));
else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI)
- handle_exception_irqoff(vmx);
+ handle_exception_irqoff(vcpu, vmx_get_intr_info(vcpu));
}
/*
* The kvm parameter can be NULL (module initialization, or invocation before
* VM creation). Be sure to check the kvm parameter before using it.
*/
-static bool vmx_has_emulated_msr(struct kvm *kvm, u32 index)
+bool vmx_has_emulated_msr(struct kvm *kvm, u32 index)
{
switch (index) {
case MSR_IA32_SMBASE:
@@ -7138,7 +7161,7 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
IDT_VECTORING_ERROR_CODE);
}
-static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
+void vmx_cancel_injection(struct kvm_vcpu *vcpu)
{
__vmx_complete_interrupts(vcpu,
vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
@@ -7246,6 +7269,8 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu,
return handle_fastpath_set_msr_irqoff(vcpu);
case EXIT_REASON_PREEMPTION_TIMER:
return handle_fastpath_preemption_timer(vcpu, force_immediate_exit);
+ case EXIT_REASON_HLT:
+ return handle_fastpath_hlt(vcpu);
default:
return EXIT_FASTPATH_NONE;
}
@@ -7308,7 +7333,7 @@ out:
guest_state_exit_irqoff();
}
-static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
+fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long cr3, cr4;
@@ -7463,7 +7488,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
return vmx_exit_handlers_fastpath(vcpu, force_immediate_exit);
}
-static void vmx_vcpu_free(struct kvm_vcpu *vcpu)
+void vmx_vcpu_free(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -7472,9 +7497,10 @@ static void vmx_vcpu_free(struct kvm_vcpu *vcpu)
free_vpid(vmx->vpid);
nested_vmx_free_vcpu(vcpu);
free_loaded_vmcs(vmx->loaded_vmcs);
+ free_page((unsigned long)vmx->ve_info);
}
-static int vmx_vcpu_create(struct kvm_vcpu *vcpu)
+int vmx_vcpu_create(struct kvm_vcpu *vcpu)
{
struct vmx_uret_msr *tsx_ctrl;
struct vcpu_vmx *vmx;
@@ -7565,6 +7591,20 @@ static int vmx_vcpu_create(struct kvm_vcpu *vcpu)
goto free_vmcs;
}
+ err = -ENOMEM;
+ if (vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_EPT_VIOLATION_VE) {
+ struct page *page;
+
+ BUILD_BUG_ON(sizeof(*vmx->ve_info) > PAGE_SIZE);
+
+ /* ve_info must be page aligned. */
+ page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ if (!page)
+ goto free_vmcs;
+
+ vmx->ve_info = page_to_virt(page);
+ }
+
if (vmx_can_use_ipiv(vcpu))
WRITE_ONCE(to_kvm_vmx(vcpu->kvm)->pid_table[vcpu->vcpu_id],
__pa(&vmx->pi_desc) | PID_TABLE_ENTRY_VALID);
@@ -7583,7 +7623,7 @@ free_vpid:
#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
-static int vmx_vm_init(struct kvm *kvm)
+int vmx_vm_init(struct kvm *kvm)
{
if (!ple_gap)
kvm->arch.pause_in_guest = true;
@@ -7614,41 +7654,25 @@ static int vmx_vm_init(struct kvm *kvm)
return 0;
}
-static u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
+u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
{
- /* We wanted to honor guest CD/MTRR/PAT, but doing so could result in
- * memory aliases with conflicting memory types and sometimes MCEs.
- * We have to be careful as to what are honored and when.
- *
- * For MMIO, guest CD/MTRR are ignored. The EPT memory type is set to
- * UC. The effective memory type is UC or WC depending on guest PAT.
- * This was historically the source of MCEs and we want to be
- * conservative.
- *
- * When there is no need to deal with noncoherent DMA (e.g., no VT-d
- * or VT-d has snoop control), guest CD/MTRR/PAT are all ignored. The
- * EPT memory type is set to WB. The effective memory type is forced
- * WB.
- *
- * Otherwise, we trust guest. Guest CD/MTRR/PAT are all honored. The
- * EPT memory type is used to emulate guest CD/MTRR.
+ /*
+ * Force UC for host MMIO regions, as allowing the guest to access MMIO
+ * with cacheable accesses will result in Machine Checks.
*/
-
if (is_mmio)
return MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT;
+ /*
+ * Force WB and ignore guest PAT if the VM does NOT have a non-coherent
+ * device attached. Letting the guest control memory types on Intel
+ * CPUs may result in unexpected behavior, and so KVM's ABI is to trust
+ * the guest to behave only as a last resort.
+ */
if (!kvm_arch_has_noncoherent_dma(vcpu->kvm))
return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT;
- if (kvm_read_cr0_bits(vcpu, X86_CR0_CD)) {
- if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
- return MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT;
- else
- return (MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT) |
- VMX_EPT_IPAT_BIT;
- }
-
- return kvm_mtrr_get_guest_memory_type(vcpu, gfn) << VMX_EPT_MT_EPTE_SHIFT;
+ return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT);
}
static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx, u32 new_ctl)
@@ -7786,7 +7810,7 @@ static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4));
}
-static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
+void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -7947,6 +7971,7 @@ static __init void vmx_set_cpu_caps(void)
kvm_cpu_cap_clear(X86_FEATURE_SGX_LC);
kvm_cpu_cap_clear(X86_FEATURE_SGX1);
kvm_cpu_cap_clear(X86_FEATURE_SGX2);
+ kvm_cpu_cap_clear(X86_FEATURE_SGX_EDECCSSA);
}
if (vmx_umip_emulated())
@@ -8001,10 +8026,10 @@ static int vmx_check_intercept_io(struct kvm_vcpu *vcpu,
return intercept ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE;
}
-static int vmx_check_intercept(struct kvm_vcpu *vcpu,
- struct x86_instruction_info *info,
- enum x86_intercept_stage stage,
- struct x86_exception *exception)
+int vmx_check_intercept(struct kvm_vcpu *vcpu,
+ struct x86_instruction_info *info,
+ enum x86_intercept_stage stage,
+ struct x86_exception *exception)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
@@ -8084,8 +8109,8 @@ static inline int u64_shl_div_u64(u64 a, unsigned int shift,
return 0;
}
-static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
- bool *expired)
+int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
+ bool *expired)
{
struct vcpu_vmx *vmx;
u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles;
@@ -8124,18 +8149,12 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
return 0;
}
-static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
+void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
{
to_vmx(vcpu)->hv_deadline_tsc = -1;
}
#endif
-static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
-{
- if (!kvm_pause_in_guest(vcpu->kvm))
- shrink_ple_window(vcpu);
-}
-
void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -8159,7 +8178,7 @@ void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu)
secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML);
}
-static void vmx_setup_mce(struct kvm_vcpu *vcpu)
+void vmx_setup_mce(struct kvm_vcpu *vcpu)
{
if (vcpu->arch.mcg_cap & MCG_LMCE_P)
to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
@@ -8170,7 +8189,7 @@ static void vmx_setup_mce(struct kvm_vcpu *vcpu)
}
#ifdef CONFIG_KVM_SMM
-static int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
+int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
{
/* we need a nested vmexit to enter SMM, postpone if run is pending */
if (to_vmx(vcpu)->nested.nested_run_pending)
@@ -8178,7 +8197,7 @@ static int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
return !is_smm(vcpu);
}
-static int vmx_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram)
+int vmx_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -8199,7 +8218,7 @@ static int vmx_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram)
return 0;
}
-static int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
+int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
int ret;
@@ -8220,18 +8239,18 @@ static int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
return 0;
}
-static void vmx_enable_smi_window(struct kvm_vcpu *vcpu)
+void vmx_enable_smi_window(struct kvm_vcpu *vcpu)
{
/* RSM will cause a vmexit anyway. */
}
#endif
-static bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
+bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
{
return to_vmx(vcpu)->nested.vmxon && !is_guest_mode(vcpu);
}
-static void vmx_migrate_timers(struct kvm_vcpu *vcpu)
+void vmx_migrate_timers(struct kvm_vcpu *vcpu)
{
if (is_guest_mode(vcpu)) {
struct hrtimer *timer = &to_vmx(vcpu)->nested.preemption_timer;
@@ -8241,7 +8260,7 @@ static void vmx_migrate_timers(struct kvm_vcpu *vcpu)
}
}
-static void vmx_hardware_unsetup(void)
+void vmx_hardware_unsetup(void)
{
kvm_set_posted_intr_wakeup_handler(NULL);
@@ -8251,18 +8270,7 @@ static void vmx_hardware_unsetup(void)
free_kvm_area();
}
-#define VMX_REQUIRED_APICV_INHIBITS \
-( \
- BIT(APICV_INHIBIT_REASON_DISABLE)| \
- BIT(APICV_INHIBIT_REASON_ABSENT) | \
- BIT(APICV_INHIBIT_REASON_HYPERV) | \
- BIT(APICV_INHIBIT_REASON_BLOCKIRQ) | \
- BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED) | \
- BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) | \
- BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED) \
-)
-
-static void vmx_vm_destroy(struct kvm *kvm)
+void vmx_vm_destroy(struct kvm *kvm)
{
struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
@@ -8313,148 +8321,6 @@ gva_t vmx_get_untagged_addr(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags
return (sign_extend64(gva, lam_bit) & ~BIT_ULL(63)) | (gva & BIT_ULL(63));
}
-static struct kvm_x86_ops vmx_x86_ops __initdata = {
- .name = KBUILD_MODNAME,
-
- .check_processor_compatibility = vmx_check_processor_compat,
-
- .hardware_unsetup = vmx_hardware_unsetup,
-
- .hardware_enable = vmx_hardware_enable,
- .hardware_disable = vmx_hardware_disable,
- .has_emulated_msr = vmx_has_emulated_msr,
-
- .vm_size = sizeof(struct kvm_vmx),
- .vm_init = vmx_vm_init,
- .vm_destroy = vmx_vm_destroy,
-
- .vcpu_precreate = vmx_vcpu_precreate,
- .vcpu_create = vmx_vcpu_create,
- .vcpu_free = vmx_vcpu_free,
- .vcpu_reset = vmx_vcpu_reset,
-
- .prepare_switch_to_guest = vmx_prepare_switch_to_guest,
- .vcpu_load = vmx_vcpu_load,
- .vcpu_put = vmx_vcpu_put,
-
- .update_exception_bitmap = vmx_update_exception_bitmap,
- .get_msr_feature = vmx_get_msr_feature,
- .get_msr = vmx_get_msr,
- .set_msr = vmx_set_msr,
- .get_segment_base = vmx_get_segment_base,
- .get_segment = vmx_get_segment,
- .set_segment = vmx_set_segment,
- .get_cpl = vmx_get_cpl,
- .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
- .is_valid_cr0 = vmx_is_valid_cr0,
- .set_cr0 = vmx_set_cr0,
- .is_valid_cr4 = vmx_is_valid_cr4,
- .set_cr4 = vmx_set_cr4,
- .set_efer = vmx_set_efer,
- .get_idt = vmx_get_idt,
- .set_idt = vmx_set_idt,
- .get_gdt = vmx_get_gdt,
- .set_gdt = vmx_set_gdt,
- .set_dr7 = vmx_set_dr7,
- .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
- .cache_reg = vmx_cache_reg,
- .get_rflags = vmx_get_rflags,
- .set_rflags = vmx_set_rflags,
- .get_if_flag = vmx_get_if_flag,
-
- .flush_tlb_all = vmx_flush_tlb_all,
- .flush_tlb_current = vmx_flush_tlb_current,
- .flush_tlb_gva = vmx_flush_tlb_gva,
- .flush_tlb_guest = vmx_flush_tlb_guest,
-
- .vcpu_pre_run = vmx_vcpu_pre_run,
- .vcpu_run = vmx_vcpu_run,
- .handle_exit = vmx_handle_exit,
- .skip_emulated_instruction = vmx_skip_emulated_instruction,
- .update_emulated_instruction = vmx_update_emulated_instruction,
- .set_interrupt_shadow = vmx_set_interrupt_shadow,
- .get_interrupt_shadow = vmx_get_interrupt_shadow,
- .patch_hypercall = vmx_patch_hypercall,
- .inject_irq = vmx_inject_irq,
- .inject_nmi = vmx_inject_nmi,
- .inject_exception = vmx_inject_exception,
- .cancel_injection = vmx_cancel_injection,
- .interrupt_allowed = vmx_interrupt_allowed,
- .nmi_allowed = vmx_nmi_allowed,
- .get_nmi_mask = vmx_get_nmi_mask,
- .set_nmi_mask = vmx_set_nmi_mask,
- .enable_nmi_window = vmx_enable_nmi_window,
- .enable_irq_window = vmx_enable_irq_window,
- .update_cr8_intercept = vmx_update_cr8_intercept,
- .set_virtual_apic_mode = vmx_set_virtual_apic_mode,
- .set_apic_access_page_addr = vmx_set_apic_access_page_addr,
- .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
- .load_eoi_exitmap = vmx_load_eoi_exitmap,
- .apicv_pre_state_restore = vmx_apicv_pre_state_restore,
- .required_apicv_inhibits = VMX_REQUIRED_APICV_INHIBITS,
- .hwapic_irr_update = vmx_hwapic_irr_update,
- .hwapic_isr_update = vmx_hwapic_isr_update,
- .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt,
- .sync_pir_to_irr = vmx_sync_pir_to_irr,
- .deliver_interrupt = vmx_deliver_interrupt,
- .dy_apicv_has_pending_interrupt = pi_has_pending_interrupt,
-
- .set_tss_addr = vmx_set_tss_addr,
- .set_identity_map_addr = vmx_set_identity_map_addr,
- .get_mt_mask = vmx_get_mt_mask,
-
- .get_exit_info = vmx_get_exit_info,
-
- .vcpu_after_set_cpuid = vmx_vcpu_after_set_cpuid,
-
- .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
-
- .get_l2_tsc_offset = vmx_get_l2_tsc_offset,
- .get_l2_tsc_multiplier = vmx_get_l2_tsc_multiplier,
- .write_tsc_offset = vmx_write_tsc_offset,
- .write_tsc_multiplier = vmx_write_tsc_multiplier,
-
- .load_mmu_pgd = vmx_load_mmu_pgd,
-
- .check_intercept = vmx_check_intercept,
- .handle_exit_irqoff = vmx_handle_exit_irqoff,
-
- .sched_in = vmx_sched_in,
-
- .cpu_dirty_log_size = PML_ENTITY_NUM,
- .update_cpu_dirty_logging = vmx_update_cpu_dirty_logging,
-
- .nested_ops = &vmx_nested_ops,
-
- .pi_update_irte = vmx_pi_update_irte,
- .pi_start_assignment = vmx_pi_start_assignment,
-
-#ifdef CONFIG_X86_64
- .set_hv_timer = vmx_set_hv_timer,
- .cancel_hv_timer = vmx_cancel_hv_timer,
-#endif
-
- .setup_mce = vmx_setup_mce,
-
-#ifdef CONFIG_KVM_SMM
- .smi_allowed = vmx_smi_allowed,
- .enter_smm = vmx_enter_smm,
- .leave_smm = vmx_leave_smm,
- .enable_smi_window = vmx_enable_smi_window,
-#endif
-
- .check_emulate_instruction = vmx_check_emulate_instruction,
- .apic_init_signal_blocked = vmx_apic_init_signal_blocked,
- .migrate_timers = vmx_migrate_timers,
-
- .msr_filter_changed = vmx_msr_filter_changed,
- .complete_emulated_msr = kvm_complete_insn_gp,
-
- .vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector,
-
- .get_untagged_addr = vmx_get_untagged_addr,
-};
-
static unsigned int vmx_handle_intel_pt_intr(void)
{
struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
@@ -8500,18 +8366,16 @@ static void __init vmx_setup_me_spte_mask(void)
u64 me_mask = 0;
/*
- * kvm_get_shadow_phys_bits() returns shadow_phys_bits. Use
- * the former to avoid exposing shadow_phys_bits.
- *
* On pre-MKTME system, boot_cpu_data.x86_phys_bits equals to
- * shadow_phys_bits. On MKTME and/or TDX capable systems,
+ * kvm_host.maxphyaddr. On MKTME and/or TDX capable systems,
* boot_cpu_data.x86_phys_bits holds the actual physical address
- * w/o the KeyID bits, and shadow_phys_bits equals to MAXPHYADDR
- * reported by CPUID. Those bits between are KeyID bits.
+ * w/o the KeyID bits, and kvm_host.maxphyaddr equals to
+ * MAXPHYADDR reported by CPUID. Those bits between are KeyID bits.
*/
- if (boot_cpu_data.x86_phys_bits != kvm_get_shadow_phys_bits())
+ if (boot_cpu_data.x86_phys_bits != kvm_host.maxphyaddr)
me_mask = rsvd_bits(boot_cpu_data.x86_phys_bits,
- kvm_get_shadow_phys_bits() - 1);
+ kvm_host.maxphyaddr - 1);
+
/*
* Unlike SME, host kernel doesn't support setting up any
* MKTME KeyID on Intel platforms. No memory encryption
@@ -8520,9 +8384,7 @@ static void __init vmx_setup_me_spte_mask(void)
kvm_mmu_set_me_spte_mask(0, me_mask);
}
-static struct kvm_x86_init_ops vmx_init_ops __initdata;
-
-static __init int hardware_setup(void)
+__init int vmx_hardware_setup(void)
{
unsigned long host_bndcfgs;
struct desc_ptr dt;
@@ -8591,16 +8453,16 @@ static __init int hardware_setup(void)
* using the APIC_ACCESS_ADDR VMCS field.
*/
if (!flexpriority_enabled)
- vmx_x86_ops.set_apic_access_page_addr = NULL;
+ vt_x86_ops.set_apic_access_page_addr = NULL;
if (!cpu_has_vmx_tpr_shadow())
- vmx_x86_ops.update_cr8_intercept = NULL;
+ vt_x86_ops.update_cr8_intercept = NULL;
#if IS_ENABLED(CONFIG_HYPERV)
if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH
&& enable_ept) {
- vmx_x86_ops.flush_remote_tlbs = hv_flush_remote_tlbs;
- vmx_x86_ops.flush_remote_tlbs_range = hv_flush_remote_tlbs_range;
+ vt_x86_ops.flush_remote_tlbs = hv_flush_remote_tlbs;
+ vt_x86_ops.flush_remote_tlbs_range = hv_flush_remote_tlbs_range;
}
#endif
@@ -8615,7 +8477,7 @@ static __init int hardware_setup(void)
if (!cpu_has_vmx_apicv())
enable_apicv = 0;
if (!enable_apicv)
- vmx_x86_ops.sync_pir_to_irr = NULL;
+ vt_x86_ops.sync_pir_to_irr = NULL;
if (!enable_apicv || !cpu_has_vmx_ipiv())
enable_ipiv = false;
@@ -8651,7 +8513,7 @@ static __init int hardware_setup(void)
enable_pml = 0;
if (!enable_pml)
- vmx_x86_ops.cpu_dirty_log_size = 0;
+ vt_x86_ops.cpu_dirty_log_size = 0;
if (!cpu_has_vmx_preemption_timer())
enable_preemption_timer = false;
@@ -8660,7 +8522,7 @@ static __init int hardware_setup(void)
u64 use_timer_freq = 5000ULL * 1000 * 1000;
cpu_preemption_timer_multi =
- vmcs_config.misc & VMX_MISC_PREEMPTION_TIMER_RATE_MASK;
+ vmx_misc_preemption_timer_rate(vmcs_config.misc);
if (tsc_khz)
use_timer_freq = (u64)tsc_khz * 1000;
@@ -8676,8 +8538,8 @@ static __init int hardware_setup(void)
}
if (!enable_preemption_timer) {
- vmx_x86_ops.set_hv_timer = NULL;
- vmx_x86_ops.cancel_hv_timer = NULL;
+ vt_x86_ops.set_hv_timer = NULL;
+ vt_x86_ops.cancel_hv_timer = NULL;
}
kvm_caps.supported_mce_cap |= MCG_LMCE_P;
@@ -8688,9 +8550,9 @@ static __init int hardware_setup(void)
if (!enable_ept || !enable_pmu || !cpu_has_vmx_intel_pt())
pt_mode = PT_MODE_SYSTEM;
if (pt_mode == PT_MODE_HOST_GUEST)
- vmx_init_ops.handle_intel_pt_intr = vmx_handle_intel_pt_intr;
+ vt_init_ops.handle_intel_pt_intr = vmx_handle_intel_pt_intr;
else
- vmx_init_ops.handle_intel_pt_intr = NULL;
+ vt_init_ops.handle_intel_pt_intr = NULL;
setup_default_sgx_lepubkeyhash();
@@ -8713,14 +8575,6 @@ static __init int hardware_setup(void)
return r;
}
-static struct kvm_x86_init_ops vmx_init_ops __initdata = {
- .hardware_setup = hardware_setup,
- .handle_intel_pt_intr = NULL,
-
- .runtime_ops = &vmx_x86_ops,
- .pmu_ops = &intel_pmu_ops,
-};
-
static void vmx_cleanup_l1d_flush(void)
{
if (vmx_l1d_flush_pages) {
@@ -8735,17 +8589,15 @@ static void __vmx_exit(void)
{
allow_smaller_maxphyaddr = false;
- cpu_emergency_unregister_virt_callback(vmx_emergency_disable);
-
vmx_cleanup_l1d_flush();
}
static void vmx_exit(void)
{
kvm_exit();
+ __vmx_exit();
kvm_x86_vendor_exit();
- __vmx_exit();
}
module_exit(vmx_exit);
@@ -8762,7 +8614,7 @@ static int __init vmx_init(void)
*/
hv_init_evmcs();
- r = kvm_x86_vendor_init(&vmx_init_ops);
+ r = kvm_x86_vendor_init(&vt_init_ops);
if (r)
return r;
@@ -8783,8 +8635,6 @@ static int __init vmx_init(void)
pi_init_cpu(cpu);
}
- cpu_emergency_register_virt_callback(vmx_emergency_disable);
-
vmx_check_vmcs12_offsets();
/*
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 90f9e4434646..2325f773a20b 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -7,20 +7,16 @@
#include <asm/kvm.h>
#include <asm/intel_pt.h>
#include <asm/perf_event.h>
+#include <asm/posted_intr.h>
#include "capabilities.h"
#include "../kvm_cache_regs.h"
-#include "posted_intr.h"
#include "vmcs.h"
#include "vmx_ops.h"
#include "../cpuid.h"
#include "run_flags.h"
#include "../mmu.h"
-#define MSR_TYPE_R 1
-#define MSR_TYPE_W 2
-#define MSR_TYPE_RW 3
-
#define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4))
#ifdef CONFIG_X86_64
@@ -365,6 +361,9 @@ struct vcpu_vmx {
DECLARE_BITMAP(read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
DECLARE_BITMAP(write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
} shadow_msr_intercept;
+
+ /* ve_info must be page aligned. */
+ struct vmx_ve_information *ve_info;
};
struct kvm_vmx {
@@ -403,6 +402,7 @@ u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level);
bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu);
void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu);
bool vmx_nmi_blocked(struct kvm_vcpu *vcpu);
+bool __vmx_interrupt_blocked(struct kvm_vcpu *vcpu);
bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu);
bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu);
void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
@@ -577,7 +577,8 @@ static inline u8 vmx_get_rvi(void)
SECONDARY_EXEC_ENABLE_VMFUNC | \
SECONDARY_EXEC_BUS_LOCK_DETECTION | \
SECONDARY_EXEC_NOTIFY_VM_EXITING | \
- SECONDARY_EXEC_ENCLS_EXITING)
+ SECONDARY_EXEC_ENCLS_EXITING | \
+ SECONDARY_EXEC_EPT_VIOLATION_VE)
#define KVM_REQUIRED_VMX_TERTIARY_VM_EXEC_CONTROL 0
#define KVM_OPTIONAL_VMX_TERTIARY_VM_EXEC_CONTROL \
@@ -723,7 +724,7 @@ static inline bool vmx_need_pf_intercept(struct kvm_vcpu *vcpu)
return true;
return allow_smaller_maxphyaddr &&
- cpuid_maxphyaddr(vcpu) < kvm_get_shadow_phys_bits();
+ cpuid_maxphyaddr(vcpu) < kvm_host.maxphyaddr;
}
static inline bool is_unrestricted_guest(struct kvm_vcpu *vcpu)
@@ -751,4 +752,9 @@ static inline bool vmx_can_use_ipiv(struct kvm_vcpu *vcpu)
return lapic_in_kernel(vcpu) && enable_ipiv;
}
+static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
+{
+ vmx->segment_cache.bitmask = 0;
+}
+
#endif /* __KVM_X86_VMX_H */
diff --git a/arch/x86/kvm/vmx/vmx_onhyperv.h b/arch/x86/kvm/vmx/vmx_onhyperv.h
index eb48153bfd73..bba24ed99ee6 100644
--- a/arch/x86/kvm/vmx/vmx_onhyperv.h
+++ b/arch/x86/kvm/vmx/vmx_onhyperv.h
@@ -104,6 +104,14 @@ static inline void evmcs_load(u64 phys_addr)
struct hv_vp_assist_page *vp_ap =
hv_get_vp_assist_page(smp_processor_id());
+ /*
+ * When enabling eVMCS, KVM verifies that every CPU has a valid hv_vp_assist_page()
+ * and aborts enabling the feature otherwise. CPU onlining path is also checked in
+ * vmx_hardware_enable().
+ */
+ if (KVM_BUG_ON(!vp_ap, kvm_get_running_vcpu()->kvm))
+ return;
+
if (current_evmcs->hv_enlightenments_control.nested_flush_hypercall)
vp_ap->nested_control.features.directhypercall = 1;
vp_ap->current_nested_vmcs = phys_addr;
diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h
index 8060e5fc6dbd..93e020dc88f6 100644
--- a/arch/x86/kvm/vmx/vmx_ops.h
+++ b/arch/x86/kvm/vmx/vmx_ops.h
@@ -47,7 +47,7 @@ static __always_inline void vmcs_check16(unsigned long field)
BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
"16-bit accessor invalid for 64-bit high field");
BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000,
- "16-bit accessor invalid for 32-bit high field");
+ "16-bit accessor invalid for 32-bit field");
BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
"16-bit accessor invalid for natural width field");
}
diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
new file mode 100644
index 000000000000..a55981c5216e
--- /dev/null
+++ b/arch/x86/kvm/vmx/x86_ops.h
@@ -0,0 +1,121 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_VMX_X86_OPS_H
+#define __KVM_X86_VMX_X86_OPS_H
+
+#include <linux/kvm_host.h>
+
+#include "x86.h"
+
+__init int vmx_hardware_setup(void);
+
+extern struct kvm_x86_ops vt_x86_ops __initdata;
+extern struct kvm_x86_init_ops vt_init_ops __initdata;
+
+void vmx_hardware_unsetup(void);
+int vmx_check_processor_compat(void);
+int vmx_enable_virtualization_cpu(void);
+void vmx_disable_virtualization_cpu(void);
+void vmx_emergency_disable_virtualization_cpu(void);
+int vmx_vm_init(struct kvm *kvm);
+void vmx_vm_destroy(struct kvm *kvm);
+int vmx_vcpu_precreate(struct kvm *kvm);
+int vmx_vcpu_create(struct kvm_vcpu *vcpu);
+int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu);
+fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit);
+void vmx_vcpu_free(struct kvm_vcpu *vcpu);
+void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
+void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
+void vmx_vcpu_put(struct kvm_vcpu *vcpu);
+int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath);
+void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu);
+int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu);
+void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu);
+int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
+#ifdef CONFIG_KVM_SMM
+int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection);
+int vmx_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram);
+int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram);
+void vmx_enable_smi_window(struct kvm_vcpu *vcpu);
+#endif
+int vmx_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
+ void *insn, int insn_len);
+int vmx_check_intercept(struct kvm_vcpu *vcpu,
+ struct x86_instruction_info *info,
+ enum x86_intercept_stage stage,
+ struct x86_exception *exception);
+bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu);
+void vmx_migrate_timers(struct kvm_vcpu *vcpu);
+void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu);
+void vmx_apicv_pre_state_restore(struct kvm_vcpu *vcpu);
+void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr);
+void vmx_hwapic_isr_update(int max_isr);
+int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu);
+void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
+ int trig_mode, int vector);
+void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu);
+bool vmx_has_emulated_msr(struct kvm *kvm, u32 index);
+void vmx_msr_filter_changed(struct kvm_vcpu *vcpu);
+void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu);
+void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu);
+int vmx_get_feature_msr(u32 msr, u64 *data);
+int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
+u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg);
+void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
+void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
+int vmx_get_cpl(struct kvm_vcpu *vcpu);
+void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
+bool vmx_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
+void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
+void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level);
+void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
+bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
+int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer);
+void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val);
+void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu);
+void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg);
+unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu);
+void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
+bool vmx_get_if_flag(struct kvm_vcpu *vcpu);
+void vmx_flush_tlb_all(struct kvm_vcpu *vcpu);
+void vmx_flush_tlb_current(struct kvm_vcpu *vcpu);
+void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr);
+void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu);
+void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask);
+u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu);
+void vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall);
+void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected);
+void vmx_inject_nmi(struct kvm_vcpu *vcpu);
+void vmx_inject_exception(struct kvm_vcpu *vcpu);
+void vmx_cancel_injection(struct kvm_vcpu *vcpu);
+int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection);
+int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection);
+bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu);
+void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
+void vmx_enable_nmi_window(struct kvm_vcpu *vcpu);
+void vmx_enable_irq_window(struct kvm_vcpu *vcpu);
+void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr);
+void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu);
+void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu);
+void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
+int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
+int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr);
+u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
+void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
+ u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code);
+u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu);
+u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu);
+void vmx_write_tsc_offset(struct kvm_vcpu *vcpu);
+void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu);
+void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu);
+#ifdef CONFIG_X86_64
+int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
+ bool *expired);
+void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu);
+#endif
+void vmx_setup_mce(struct kvm_vcpu *vcpu);
+
+#endif /* __KVM_X86_VMX_X86_OPS_H */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 91478b769af0..83fe0a78146f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -92,11 +92,17 @@
#define MAX_IO_MSRS 256
#define KVM_MAX_MCE_BANKS 32
-struct kvm_caps kvm_caps __read_mostly = {
- .supported_mce_cap = MCG_CTL_P | MCG_SER_P,
-};
+/*
+ * Note, kvm_caps fields should *never* have default values, all fields must be
+ * recomputed from scratch during vendor module load, e.g. to account for a
+ * vendor module being reloaded with different module parameters.
+ */
+struct kvm_caps kvm_caps __read_mostly;
EXPORT_SYMBOL_GPL(kvm_caps);
+struct kvm_host_values kvm_host __read_mostly;
+EXPORT_SYMBOL_GPL(kvm_host);
+
#define ERR_PTR_USR(e) ((void __user *)ERR_PTR(e))
#define emul_to_vcpu(ctxt) \
@@ -161,15 +167,6 @@ module_param(kvmclock_periodic_sync, bool, 0444);
static u32 __read_mostly tsc_tolerance_ppm = 250;
module_param(tsc_tolerance_ppm, uint, 0644);
-/*
- * lapic timer advance (tscdeadline mode only) in nanoseconds. '-1' enables
- * adaptive tuning starting from default advancement of 1000ns. '0' disables
- * advancement entirely. Any other value is used as-is and disables adaptive
- * tuning, i.e. allows privileged userspace to set an exact advancement time.
- */
-static int __read_mostly lapic_timer_advance_ns = -1;
-module_param(lapic_timer_advance_ns, int, 0644);
-
static bool __read_mostly vector_hashing = true;
module_param(vector_hashing, bool, 0444);
@@ -226,21 +223,12 @@ static struct kvm_user_return_msrs __percpu *user_return_msrs;
| XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \
| XFEATURE_MASK_PKRU | XFEATURE_MASK_XTILE)
-u64 __read_mostly host_efer;
-EXPORT_SYMBOL_GPL(host_efer);
-
bool __read_mostly allow_smaller_maxphyaddr = 0;
EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr);
bool __read_mostly enable_apicv = true;
EXPORT_SYMBOL_GPL(enable_apicv);
-u64 __read_mostly host_xss;
-EXPORT_SYMBOL_GPL(host_xss);
-
-u64 __read_mostly host_arch_capabilities;
-EXPORT_SYMBOL_GPL(host_arch_capabilities);
-
const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
KVM_GENERIC_VM_STATS(),
STATS_DESC_COUNTER(VM, mmu_shadow_zapped),
@@ -314,29 +302,240 @@ const struct kvm_stats_header kvm_vcpu_stats_header = {
sizeof(kvm_vcpu_stats_desc),
};
-u64 __read_mostly host_xcr0;
-
static struct kmem_cache *x86_emulator_cache;
/*
- * When called, it means the previous get/set msr reached an invalid msr.
- * Return true if we want to ignore/silent this failed msr access.
+ * The three MSR lists(msrs_to_save, emulated_msrs, msr_based_features) track
+ * the set of MSRs that KVM exposes to userspace through KVM_GET_MSRS,
+ * KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. msrs_to_save holds MSRs that
+ * require host support, i.e. should be probed via RDMSR. emulated_msrs holds
+ * MSRs that KVM emulates without strictly requiring host support.
+ * msr_based_features holds MSRs that enumerate features, i.e. are effectively
+ * CPUID leafs. Note, msr_based_features isn't mutually exclusive with
+ * msrs_to_save and emulated_msrs.
+ */
+
+static const u32 msrs_to_save_base[] = {
+ MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
+ MSR_STAR,
+#ifdef CONFIG_X86_64
+ MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
+#endif
+ MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
+ MSR_IA32_FEAT_CTL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
+ MSR_IA32_SPEC_CTRL, MSR_IA32_TSX_CTRL,
+ MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH,
+ MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK,
+ MSR_IA32_RTIT_ADDR0_A, MSR_IA32_RTIT_ADDR0_B,
+ MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B,
+ MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B,
+ MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B,
+ MSR_IA32_UMWAIT_CONTROL,
+
+ MSR_IA32_XFD, MSR_IA32_XFD_ERR,
+};
+
+static const u32 msrs_to_save_pmu[] = {
+ MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1,
+ MSR_ARCH_PERFMON_FIXED_CTR0 + 2,
+ MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS,
+ MSR_CORE_PERF_GLOBAL_CTRL,
+ MSR_IA32_PEBS_ENABLE, MSR_IA32_DS_AREA, MSR_PEBS_DATA_CFG,
+
+ /* This part of MSRs should match KVM_MAX_NR_INTEL_GP_COUNTERS. */
+ MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1,
+ MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3,
+ MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5,
+ MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7,
+ MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1,
+ MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3,
+ MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5,
+ MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7,
+
+ MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3,
+ MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3,
+
+ /* This part of MSRs should match KVM_MAX_NR_AMD_GP_COUNTERS. */
+ MSR_F15H_PERF_CTL0, MSR_F15H_PERF_CTL1, MSR_F15H_PERF_CTL2,
+ MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5,
+ MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2,
+ MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5,
+
+ MSR_AMD64_PERF_CNTR_GLOBAL_CTL,
+ MSR_AMD64_PERF_CNTR_GLOBAL_STATUS,
+ MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR,
+};
+
+static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_base) +
+ ARRAY_SIZE(msrs_to_save_pmu)];
+static unsigned num_msrs_to_save;
+
+static const u32 emulated_msrs_all[] = {
+ MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
+ MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
+
+#ifdef CONFIG_KVM_HYPERV
+ HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
+ HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
+ HV_X64_MSR_TSC_FREQUENCY, HV_X64_MSR_APIC_FREQUENCY,
+ HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2,
+ HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL,
+ HV_X64_MSR_RESET,
+ HV_X64_MSR_VP_INDEX,
+ HV_X64_MSR_VP_RUNTIME,
+ HV_X64_MSR_SCONTROL,
+ HV_X64_MSR_STIMER0_CONFIG,
+ HV_X64_MSR_VP_ASSIST_PAGE,
+ HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL,
+ HV_X64_MSR_TSC_EMULATION_STATUS, HV_X64_MSR_TSC_INVARIANT_CONTROL,
+ HV_X64_MSR_SYNDBG_OPTIONS,
+ HV_X64_MSR_SYNDBG_CONTROL, HV_X64_MSR_SYNDBG_STATUS,
+ HV_X64_MSR_SYNDBG_SEND_BUFFER, HV_X64_MSR_SYNDBG_RECV_BUFFER,
+ HV_X64_MSR_SYNDBG_PENDING_BUFFER,
+#endif
+
+ MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
+ MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK,
+
+ MSR_IA32_TSC_ADJUST,
+ MSR_IA32_TSC_DEADLINE,
+ MSR_IA32_ARCH_CAPABILITIES,
+ MSR_IA32_PERF_CAPABILITIES,
+ MSR_IA32_MISC_ENABLE,
+ MSR_IA32_MCG_STATUS,
+ MSR_IA32_MCG_CTL,
+ MSR_IA32_MCG_EXT_CTL,
+ MSR_IA32_SMBASE,
+ MSR_SMI_COUNT,
+ MSR_PLATFORM_INFO,
+ MSR_MISC_FEATURES_ENABLES,
+ MSR_AMD64_VIRT_SPEC_CTRL,
+ MSR_AMD64_TSC_RATIO,
+ MSR_IA32_POWER_CTL,
+ MSR_IA32_UCODE_REV,
+
+ /*
+ * KVM always supports the "true" VMX control MSRs, even if the host
+ * does not. The VMX MSRs as a whole are considered "emulated" as KVM
+ * doesn't strictly require them to exist in the host (ignoring that
+ * KVM would refuse to load in the first place if the core set of MSRs
+ * aren't supported).
+ */
+ MSR_IA32_VMX_BASIC,
+ MSR_IA32_VMX_TRUE_PINBASED_CTLS,
+ MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
+ MSR_IA32_VMX_TRUE_EXIT_CTLS,
+ MSR_IA32_VMX_TRUE_ENTRY_CTLS,
+ MSR_IA32_VMX_MISC,
+ MSR_IA32_VMX_CR0_FIXED0,
+ MSR_IA32_VMX_CR4_FIXED0,
+ MSR_IA32_VMX_VMCS_ENUM,
+ MSR_IA32_VMX_PROCBASED_CTLS2,
+ MSR_IA32_VMX_EPT_VPID_CAP,
+ MSR_IA32_VMX_VMFUNC,
+
+ MSR_K7_HWCR,
+ MSR_KVM_POLL_CONTROL,
+};
+
+static u32 emulated_msrs[ARRAY_SIZE(emulated_msrs_all)];
+static unsigned num_emulated_msrs;
+
+/*
+ * List of MSRs that control the existence of MSR-based features, i.e. MSRs
+ * that are effectively CPUID leafs. VMX MSRs are also included in the set of
+ * feature MSRs, but are handled separately to allow expedited lookups.
+ */
+static const u32 msr_based_features_all_except_vmx[] = {
+ MSR_AMD64_DE_CFG,
+ MSR_IA32_UCODE_REV,
+ MSR_IA32_ARCH_CAPABILITIES,
+ MSR_IA32_PERF_CAPABILITIES,
+};
+
+static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all_except_vmx) +
+ (KVM_LAST_EMULATED_VMX_MSR - KVM_FIRST_EMULATED_VMX_MSR + 1)];
+static unsigned int num_msr_based_features;
+
+/*
+ * All feature MSRs except uCode revID, which tracks the currently loaded uCode
+ * patch, are immutable once the vCPU model is defined.
*/
-static bool kvm_msr_ignored_check(u32 msr, u64 data, bool write)
+static bool kvm_is_immutable_feature_msr(u32 msr)
{
- const char *op = write ? "wrmsr" : "rdmsr";
+ int i;
- if (ignore_msrs) {
- if (report_ignored_msrs)
- kvm_pr_unimpl("ignored %s: 0x%x data 0x%llx\n",
- op, msr, data);
- /* Mask the error */
+ if (msr >= KVM_FIRST_EMULATED_VMX_MSR && msr <= KVM_LAST_EMULATED_VMX_MSR)
return true;
- } else {
+
+ for (i = 0; i < ARRAY_SIZE(msr_based_features_all_except_vmx); i++) {
+ if (msr == msr_based_features_all_except_vmx[i])
+ return msr != MSR_IA32_UCODE_REV;
+ }
+
+ return false;
+}
+
+static bool kvm_is_advertised_msr(u32 msr_index)
+{
+ unsigned int i;
+
+ for (i = 0; i < num_msrs_to_save; i++) {
+ if (msrs_to_save[i] == msr_index)
+ return true;
+ }
+
+ for (i = 0; i < num_emulated_msrs; i++) {
+ if (emulated_msrs[i] == msr_index)
+ return true;
+ }
+
+ return false;
+}
+
+typedef int (*msr_access_t)(struct kvm_vcpu *vcpu, u32 index, u64 *data,
+ bool host_initiated);
+
+static __always_inline int kvm_do_msr_access(struct kvm_vcpu *vcpu, u32 msr,
+ u64 *data, bool host_initiated,
+ enum kvm_msr_access rw,
+ msr_access_t msr_access_fn)
+{
+ const char *op = rw == MSR_TYPE_W ? "wrmsr" : "rdmsr";
+ int ret;
+
+ BUILD_BUG_ON(rw != MSR_TYPE_R && rw != MSR_TYPE_W);
+
+ /*
+ * Zero the data on read failures to avoid leaking stack data to the
+ * guest and/or userspace, e.g. if the failure is ignored below.
+ */
+ ret = msr_access_fn(vcpu, msr, data, host_initiated);
+ if (ret && rw == MSR_TYPE_R)
+ *data = 0;
+
+ if (ret != KVM_MSR_RET_UNSUPPORTED)
+ return ret;
+
+ /*
+ * Userspace is allowed to read MSRs, and write '0' to MSRs, that KVM
+ * advertises to userspace, even if an MSR isn't fully supported.
+ * Simply check that @data is '0', which covers both the write '0' case
+ * and all reads (in which case @data is zeroed on failure; see above).
+ */
+ if (host_initiated && !*data && kvm_is_advertised_msr(msr))
+ return 0;
+
+ if (!ignore_msrs) {
kvm_debug_ratelimited("unhandled %s: 0x%x data 0x%llx\n",
- op, msr, data);
- return false;
+ op, msr, *data);
+ return ret;
}
+
+ if (report_ignored_msrs)
+ kvm_pr_unimpl("ignored %s: 0x%x data 0x%llx\n", op, msr, *data);
+
+ return 0;
}
static struct kmem_cache *kvm_alloc_emulator_cache(void)
@@ -369,7 +568,7 @@ static void kvm_on_user_return(struct user_return_notifier *urn)
/*
* Disabling irqs at this point since the following code could be
- * interrupted and executed through kvm_arch_hardware_disable()
+ * interrupted and executed through kvm_arch_disable_virtualization_cpu()
*/
local_irq_save(flags);
if (msrs->registered) {
@@ -427,8 +626,7 @@ EXPORT_SYMBOL_GPL(kvm_find_user_return_msr);
static void kvm_user_return_msr_cpu_online(void)
{
- unsigned int cpu = smp_processor_id();
- struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);
+ struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs);
u64 value;
int i;
@@ -441,8 +639,7 @@ static void kvm_user_return_msr_cpu_online(void)
int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask)
{
- unsigned int cpu = smp_processor_id();
- struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);
+ struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs);
int err;
value = (value & mask) | (msrs->values[slot].host & ~mask);
@@ -464,8 +661,7 @@ EXPORT_SYMBOL_GPL(kvm_set_user_return_msr);
static void drop_user_return_notifiers(void)
{
- unsigned int cpu = smp_processor_id();
- struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);
+ struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs);
if (msrs->registered)
kvm_on_user_return(&msrs->urn);
@@ -637,12 +833,6 @@ static void kvm_queue_exception_vmexit(struct kvm_vcpu *vcpu, unsigned int vecto
ex->payload = payload;
}
-/* Forcibly leave the nested mode in cases like a vCPU reset */
-static void kvm_leave_nested(struct kvm_vcpu *vcpu)
-{
- kvm_x86_ops.nested_ops->leave_nested(vcpu);
-}
-
static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
unsigned nr, bool has_error, u32 error_code,
bool has_payload, unsigned long payload, bool reinject)
@@ -839,7 +1029,7 @@ EXPORT_SYMBOL_GPL(kvm_requeue_exception_e);
*/
bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
{
- if (static_call(kvm_x86_get_cpl)(vcpu) <= required_cpl)
+ if (kvm_x86_call(get_cpl)(vcpu) <= required_cpl)
return true;
kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
return false;
@@ -923,7 +1113,7 @@ static bool kvm_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
return false;
- return static_call(kvm_x86_is_valid_cr0)(vcpu, cr0);
+ return kvm_x86_call(is_valid_cr0)(vcpu, cr0);
}
void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0)
@@ -960,11 +1150,6 @@ void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned lon
if ((cr0 ^ old_cr0) & KVM_MMU_CR0_ROLE_BITS)
kvm_mmu_reset_context(vcpu);
-
- if (((cr0 ^ old_cr0) & X86_CR0_CD) &&
- kvm_mmu_honors_guest_mtrrs(vcpu->kvm) &&
- !kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
- kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL);
}
EXPORT_SYMBOL_GPL(kvm_post_set_cr0);
@@ -987,7 +1172,7 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
if (!is_pae(vcpu))
return 1;
- static_call(kvm_x86_get_cs_db_l_bits)(vcpu, &cs_db, &cs_l);
+ kvm_x86_call(get_cs_db_l_bits)(vcpu, &cs_db, &cs_l);
if (cs_l)
return 1;
}
@@ -1001,7 +1186,7 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
(is_64_bit_mode(vcpu) || kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE)))
return 1;
- static_call(kvm_x86_set_cr0)(vcpu, cr0);
+ kvm_x86_call(set_cr0)(vcpu, cr0);
kvm_post_set_cr0(vcpu, old_cr0, cr0);
@@ -1022,11 +1207,11 @@ void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) {
- if (vcpu->arch.xcr0 != host_xcr0)
+ if (vcpu->arch.xcr0 != kvm_host.xcr0)
xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
if (guest_can_use(vcpu, X86_FEATURE_XSAVES) &&
- vcpu->arch.ia32_xss != host_xss)
+ vcpu->arch.ia32_xss != kvm_host.xss)
wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss);
}
@@ -1053,12 +1238,12 @@ void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) {
- if (vcpu->arch.xcr0 != host_xcr0)
- xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
+ if (vcpu->arch.xcr0 != kvm_host.xcr0)
+ xsetbv(XCR_XFEATURE_ENABLED_MASK, kvm_host.xcr0);
if (guest_can_use(vcpu, X86_FEATURE_XSAVES) &&
- vcpu->arch.ia32_xss != host_xss)
- wrmsrl(MSR_IA32_XSS, host_xss);
+ vcpu->arch.ia32_xss != kvm_host.xss)
+ wrmsrl(MSR_IA32_XSS, kvm_host.xss);
}
}
@@ -1119,7 +1304,7 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu)
{
/* Note, #UD due to CR4.OSXSAVE=0 has priority over the intercept. */
- if (static_call(kvm_x86_get_cpl)(vcpu) != 0 ||
+ if (kvm_x86_call(get_cpl)(vcpu) != 0 ||
__kvm_set_xcr(vcpu, kvm_rcx_read(vcpu), kvm_read_edx_eax(vcpu))) {
kvm_inject_gp(vcpu, 0);
return 1;
@@ -1144,7 +1329,7 @@ EXPORT_SYMBOL_GPL(__kvm_is_valid_cr4);
static bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
return __kvm_is_valid_cr4(vcpu, cr4) &&
- static_call(kvm_x86_is_valid_cr4)(vcpu, cr4);
+ kvm_x86_call(is_valid_cr4)(vcpu, cr4);
}
void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned long cr4)
@@ -1212,7 +1397,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
return 1;
}
- static_call(kvm_x86_set_cr4)(vcpu, cr4);
+ kvm_x86_call(set_cr4)(vcpu, cr4);
kvm_post_set_cr4(vcpu, old_cr4, cr4);
@@ -1351,7 +1536,7 @@ void kvm_update_dr7(struct kvm_vcpu *vcpu)
dr7 = vcpu->arch.guest_debug_dr7;
else
dr7 = vcpu->arch.dr7;
- static_call(kvm_x86_set_dr7)(vcpu, dr7);
+ kvm_x86_call(set_dr7)(vcpu, dr7);
vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED;
if (dr7 & DR7_BP_EN_MASK)
vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED;
@@ -1433,178 +1618,6 @@ int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu)
EXPORT_SYMBOL_GPL(kvm_emulate_rdpmc);
/*
- * The three MSR lists(msrs_to_save, emulated_msrs, msr_based_features) track
- * the set of MSRs that KVM exposes to userspace through KVM_GET_MSRS,
- * KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. msrs_to_save holds MSRs that
- * require host support, i.e. should be probed via RDMSR. emulated_msrs holds
- * MSRs that KVM emulates without strictly requiring host support.
- * msr_based_features holds MSRs that enumerate features, i.e. are effectively
- * CPUID leafs. Note, msr_based_features isn't mutually exclusive with
- * msrs_to_save and emulated_msrs.
- */
-
-static const u32 msrs_to_save_base[] = {
- MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
- MSR_STAR,
-#ifdef CONFIG_X86_64
- MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
-#endif
- MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
- MSR_IA32_FEAT_CTL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
- MSR_IA32_SPEC_CTRL, MSR_IA32_TSX_CTRL,
- MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH,
- MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK,
- MSR_IA32_RTIT_ADDR0_A, MSR_IA32_RTIT_ADDR0_B,
- MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B,
- MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B,
- MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B,
- MSR_IA32_UMWAIT_CONTROL,
-
- MSR_IA32_XFD, MSR_IA32_XFD_ERR,
-};
-
-static const u32 msrs_to_save_pmu[] = {
- MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1,
- MSR_ARCH_PERFMON_FIXED_CTR0 + 2,
- MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS,
- MSR_CORE_PERF_GLOBAL_CTRL, MSR_CORE_PERF_GLOBAL_OVF_CTRL,
- MSR_IA32_PEBS_ENABLE, MSR_IA32_DS_AREA, MSR_PEBS_DATA_CFG,
-
- /* This part of MSRs should match KVM_INTEL_PMC_MAX_GENERIC. */
- MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1,
- MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3,
- MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5,
- MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7,
- MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1,
- MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3,
- MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5,
- MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7,
-
- MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3,
- MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3,
-
- /* This part of MSRs should match KVM_AMD_PMC_MAX_GENERIC. */
- MSR_F15H_PERF_CTL0, MSR_F15H_PERF_CTL1, MSR_F15H_PERF_CTL2,
- MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5,
- MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2,
- MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5,
-
- MSR_AMD64_PERF_CNTR_GLOBAL_CTL,
- MSR_AMD64_PERF_CNTR_GLOBAL_STATUS,
- MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR,
-};
-
-static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_base) +
- ARRAY_SIZE(msrs_to_save_pmu)];
-static unsigned num_msrs_to_save;
-
-static const u32 emulated_msrs_all[] = {
- MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
- MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
-
-#ifdef CONFIG_KVM_HYPERV
- HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
- HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
- HV_X64_MSR_TSC_FREQUENCY, HV_X64_MSR_APIC_FREQUENCY,
- HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2,
- HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL,
- HV_X64_MSR_RESET,
- HV_X64_MSR_VP_INDEX,
- HV_X64_MSR_VP_RUNTIME,
- HV_X64_MSR_SCONTROL,
- HV_X64_MSR_STIMER0_CONFIG,
- HV_X64_MSR_VP_ASSIST_PAGE,
- HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL,
- HV_X64_MSR_TSC_EMULATION_STATUS, HV_X64_MSR_TSC_INVARIANT_CONTROL,
- HV_X64_MSR_SYNDBG_OPTIONS,
- HV_X64_MSR_SYNDBG_CONTROL, HV_X64_MSR_SYNDBG_STATUS,
- HV_X64_MSR_SYNDBG_SEND_BUFFER, HV_X64_MSR_SYNDBG_RECV_BUFFER,
- HV_X64_MSR_SYNDBG_PENDING_BUFFER,
-#endif
-
- MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
- MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK,
-
- MSR_IA32_TSC_ADJUST,
- MSR_IA32_TSC_DEADLINE,
- MSR_IA32_ARCH_CAPABILITIES,
- MSR_IA32_PERF_CAPABILITIES,
- MSR_IA32_MISC_ENABLE,
- MSR_IA32_MCG_STATUS,
- MSR_IA32_MCG_CTL,
- MSR_IA32_MCG_EXT_CTL,
- MSR_IA32_SMBASE,
- MSR_SMI_COUNT,
- MSR_PLATFORM_INFO,
- MSR_MISC_FEATURES_ENABLES,
- MSR_AMD64_VIRT_SPEC_CTRL,
- MSR_AMD64_TSC_RATIO,
- MSR_IA32_POWER_CTL,
- MSR_IA32_UCODE_REV,
-
- /*
- * KVM always supports the "true" VMX control MSRs, even if the host
- * does not. The VMX MSRs as a whole are considered "emulated" as KVM
- * doesn't strictly require them to exist in the host (ignoring that
- * KVM would refuse to load in the first place if the core set of MSRs
- * aren't supported).
- */
- MSR_IA32_VMX_BASIC,
- MSR_IA32_VMX_TRUE_PINBASED_CTLS,
- MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
- MSR_IA32_VMX_TRUE_EXIT_CTLS,
- MSR_IA32_VMX_TRUE_ENTRY_CTLS,
- MSR_IA32_VMX_MISC,
- MSR_IA32_VMX_CR0_FIXED0,
- MSR_IA32_VMX_CR4_FIXED0,
- MSR_IA32_VMX_VMCS_ENUM,
- MSR_IA32_VMX_PROCBASED_CTLS2,
- MSR_IA32_VMX_EPT_VPID_CAP,
- MSR_IA32_VMX_VMFUNC,
-
- MSR_K7_HWCR,
- MSR_KVM_POLL_CONTROL,
-};
-
-static u32 emulated_msrs[ARRAY_SIZE(emulated_msrs_all)];
-static unsigned num_emulated_msrs;
-
-/*
- * List of MSRs that control the existence of MSR-based features, i.e. MSRs
- * that are effectively CPUID leafs. VMX MSRs are also included in the set of
- * feature MSRs, but are handled separately to allow expedited lookups.
- */
-static const u32 msr_based_features_all_except_vmx[] = {
- MSR_AMD64_DE_CFG,
- MSR_IA32_UCODE_REV,
- MSR_IA32_ARCH_CAPABILITIES,
- MSR_IA32_PERF_CAPABILITIES,
-};
-
-static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all_except_vmx) +
- (KVM_LAST_EMULATED_VMX_MSR - KVM_FIRST_EMULATED_VMX_MSR + 1)];
-static unsigned int num_msr_based_features;
-
-/*
- * All feature MSRs except uCode revID, which tracks the currently loaded uCode
- * patch, are immutable once the vCPU model is defined.
- */
-static bool kvm_is_immutable_feature_msr(u32 msr)
-{
- int i;
-
- if (msr >= KVM_FIRST_EMULATED_VMX_MSR && msr <= KVM_LAST_EMULATED_VMX_MSR)
- return true;
-
- for (i = 0; i < ARRAY_SIZE(msr_based_features_all_except_vmx); i++) {
- if (msr == msr_based_features_all_except_vmx[i])
- return msr != MSR_IA32_UCODE_REV;
- }
-
- return false;
-}
-
-/*
* Some IA32_ARCH_CAPABILITIES bits have dependencies on MSRs that KVM
* does not yet virtualize. These include:
* 10 - MISC_PACKAGE_CTRLS
@@ -1625,7 +1638,7 @@ static bool kvm_is_immutable_feature_msr(u32 msr)
static u64 kvm_get_arch_capabilities(void)
{
- u64 data = host_arch_capabilities & KVM_SUPPORTED_ARCH_CAP;
+ u64 data = kvm_host.arch_capabilities & KVM_SUPPORTED_ARCH_CAP;
/*
* If nx_huge_pages is enabled, KVM's shadow paging will ensure that
@@ -1681,40 +1694,31 @@ static u64 kvm_get_arch_capabilities(void)
return data;
}
-static int kvm_get_msr_feature(struct kvm_msr_entry *msr)
+static int kvm_get_feature_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
+ bool host_initiated)
{
- switch (msr->index) {
+ WARN_ON_ONCE(!host_initiated);
+
+ switch (index) {
case MSR_IA32_ARCH_CAPABILITIES:
- msr->data = kvm_get_arch_capabilities();
+ *data = kvm_get_arch_capabilities();
break;
case MSR_IA32_PERF_CAPABILITIES:
- msr->data = kvm_caps.supported_perf_cap;
+ *data = kvm_caps.supported_perf_cap;
break;
case MSR_IA32_UCODE_REV:
- rdmsrl_safe(msr->index, &msr->data);
+ rdmsrl_safe(index, data);
break;
default:
- return static_call(kvm_x86_get_msr_feature)(msr);
+ return kvm_x86_call(get_feature_msr)(index, data);
}
return 0;
}
-static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
+static int do_get_feature_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
{
- struct kvm_msr_entry msr;
- int r;
-
- /* Unconditionally clear the output for simplicity */
- msr.data = 0;
- msr.index = index;
- r = kvm_get_msr_feature(&msr);
-
- if (r == KVM_MSR_RET_INVALID && kvm_msr_ignored_check(index, 0, false))
- r = 0;
-
- *data = msr.data;
-
- return r;
+ return kvm_do_msr_access(vcpu, index, data, true, MSR_TYPE_R,
+ kvm_get_feature_msr);
}
static bool __kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
@@ -1768,7 +1772,7 @@ static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
efer &= ~EFER_LMA;
efer |= vcpu->arch.efer & EFER_LMA;
- r = static_call(kvm_x86_set_efer)(vcpu, efer);
+ r = kvm_x86_call(set_efer)(vcpu, efer);
if (r) {
WARN_ON(r > 0);
return r;
@@ -1883,11 +1887,11 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data,
* incomplete and conflicting architectural behavior. Current
* AMD CPUs completely ignore bits 63:32, i.e. they aren't
* reserved and always read as zeros. Enforce Intel's reserved
- * bits check if and only if the guest CPU is Intel, and clear
- * the bits in all other cases. This ensures cross-vendor
- * migration will provide consistent behavior for the guest.
+ * bits check if the guest CPU is Intel compatible, otherwise
+ * clear the bits. This ensures cross-vendor migration will
+ * provide consistent behavior for the guest.
*/
- if (guest_cpuid_is_intel(vcpu) && (data >> 32) != 0)
+ if (guest_cpuid_is_intel_compatible(vcpu) && (data >> 32) != 0)
return 1;
data = (u32)data;
@@ -1898,19 +1902,20 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data,
msr.index = index;
msr.host_initiated = host_initiated;
- return static_call(kvm_x86_set_msr)(vcpu, &msr);
+ return kvm_x86_call(set_msr)(vcpu, &msr);
+}
+
+static int _kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
+ bool host_initiated)
+{
+ return __kvm_set_msr(vcpu, index, *data, host_initiated);
}
static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu,
u32 index, u64 data, bool host_initiated)
{
- int ret = __kvm_set_msr(vcpu, index, data, host_initiated);
-
- if (ret == KVM_MSR_RET_INVALID)
- if (kvm_msr_ignored_check(index, data, true))
- ret = 0;
-
- return ret;
+ return kvm_do_msr_access(vcpu, index, &data, host_initiated, MSR_TYPE_W,
+ _kvm_set_msr);
}
/*
@@ -1940,7 +1945,7 @@ int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
msr.index = index;
msr.host_initiated = host_initiated;
- ret = static_call(kvm_x86_get_msr)(vcpu, &msr);
+ ret = kvm_x86_call(get_msr)(vcpu, &msr);
if (!ret)
*data = msr.data;
return ret;
@@ -1949,31 +1954,25 @@ int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu,
u32 index, u64 *data, bool host_initiated)
{
- int ret = __kvm_get_msr(vcpu, index, data, host_initiated);
-
- if (ret == KVM_MSR_RET_INVALID) {
- /* Unconditionally clear *data for simplicity */
- *data = 0;
- if (kvm_msr_ignored_check(index, 0, false))
- ret = 0;
- }
-
- return ret;
+ return kvm_do_msr_access(vcpu, index, data, host_initiated, MSR_TYPE_R,
+ __kvm_get_msr);
}
-static int kvm_get_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 *data)
+int kvm_get_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 *data)
{
if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ))
return KVM_MSR_RET_FILTERED;
return kvm_get_msr_ignored_check(vcpu, index, data, false);
}
+EXPORT_SYMBOL_GPL(kvm_get_msr_with_filter);
-static int kvm_set_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 data)
+int kvm_set_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 data)
{
if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE))
return KVM_MSR_RET_FILTERED;
return kvm_set_msr_ignored_check(vcpu, index, data, false);
}
+EXPORT_SYMBOL_GPL(kvm_set_msr_with_filter);
int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data)
{
@@ -2008,7 +2007,7 @@ static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu)
static int complete_fast_msr_access(struct kvm_vcpu *vcpu)
{
- return static_call(kvm_x86_complete_emulated_msr)(vcpu, vcpu->run->msr.error);
+ return kvm_x86_call(complete_emulated_msr)(vcpu, vcpu->run->msr.error);
}
static int complete_fast_rdmsr(struct kvm_vcpu *vcpu)
@@ -2020,7 +2019,7 @@ static int complete_fast_rdmsr(struct kvm_vcpu *vcpu)
static u64 kvm_msr_reason(int r)
{
switch (r) {
- case KVM_MSR_RET_INVALID:
+ case KVM_MSR_RET_UNSUPPORTED:
return KVM_MSR_EXIT_REASON_UNKNOWN;
case KVM_MSR_RET_FILTERED:
return KVM_MSR_EXIT_REASON_FILTER;
@@ -2072,7 +2071,7 @@ int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu)
trace_kvm_msr_read_ex(ecx);
}
- return static_call(kvm_x86_complete_emulated_msr)(vcpu, r);
+ return kvm_x86_call(complete_emulated_msr)(vcpu, r);
}
EXPORT_SYMBOL_GPL(kvm_emulate_rdmsr);
@@ -2097,7 +2096,7 @@ int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
trace_kvm_msr_write_ex(ecx, data);
}
- return static_call(kvm_x86_complete_emulated_msr)(vcpu, r);
+ return kvm_x86_call(complete_emulated_msr)(vcpu, r);
}
EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr);
@@ -2183,31 +2182,34 @@ fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu)
{
u32 msr = kvm_rcx_read(vcpu);
u64 data;
- fastpath_t ret = EXIT_FASTPATH_NONE;
+ fastpath_t ret;
+ bool handled;
kvm_vcpu_srcu_read_lock(vcpu);
switch (msr) {
case APIC_BASE_MSR + (APIC_ICR >> 4):
data = kvm_read_edx_eax(vcpu);
- if (!handle_fastpath_set_x2apic_icr_irqoff(vcpu, data)) {
- kvm_skip_emulated_instruction(vcpu);
- ret = EXIT_FASTPATH_EXIT_HANDLED;
- }
+ handled = !handle_fastpath_set_x2apic_icr_irqoff(vcpu, data);
break;
case MSR_IA32_TSC_DEADLINE:
data = kvm_read_edx_eax(vcpu);
- if (!handle_fastpath_set_tscdeadline(vcpu, data)) {
- kvm_skip_emulated_instruction(vcpu);
- ret = EXIT_FASTPATH_REENTER_GUEST;
- }
+ handled = !handle_fastpath_set_tscdeadline(vcpu, data);
break;
default:
+ handled = false;
break;
}
- if (ret != EXIT_FASTPATH_NONE)
+ if (handled) {
+ if (!kvm_skip_emulated_instruction(vcpu))
+ ret = EXIT_FASTPATH_EXIT_USERSPACE;
+ else
+ ret = EXIT_FASTPATH_REENTER_GUEST;
trace_kvm_msr_write(msr, data);
+ } else {
+ ret = EXIT_FASTPATH_NONE;
+ }
kvm_vcpu_srcu_read_unlock(vcpu);
@@ -2230,16 +2232,13 @@ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
/*
* Disallow writes to immutable feature MSRs after KVM_RUN. KVM does
* not support modifying the guest vCPU model on the fly, e.g. changing
- * the nVMX capabilities while L2 is running is nonsensical. Ignore
+ * the nVMX capabilities while L2 is running is nonsensical. Allow
* writes of the same value, e.g. to allow userspace to blindly stuff
* all MSRs when emulating RESET.
*/
- if (kvm_vcpu_has_run(vcpu) && kvm_is_immutable_feature_msr(index)) {
- if (do_get_msr(vcpu, index, &val) || *data != val)
- return -EINVAL;
-
- return 0;
- }
+ if (kvm_vcpu_has_run(vcpu) && kvm_is_immutable_feature_msr(index) &&
+ (do_get_msr(vcpu, index, &val) || *data != val))
+ return -EINVAL;
return kvm_set_msr_ignored_check(vcpu, index, *data, true);
}
@@ -2625,12 +2624,12 @@ static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 l1_offset)
if (is_guest_mode(vcpu))
vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset(
l1_offset,
- static_call(kvm_x86_get_l2_tsc_offset)(vcpu),
- static_call(kvm_x86_get_l2_tsc_multiplier)(vcpu));
+ kvm_x86_call(get_l2_tsc_offset)(vcpu),
+ kvm_x86_call(get_l2_tsc_multiplier)(vcpu));
else
vcpu->arch.tsc_offset = l1_offset;
- static_call(kvm_x86_write_tsc_offset)(vcpu);
+ kvm_x86_call(write_tsc_offset)(vcpu);
}
static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multiplier)
@@ -2641,12 +2640,12 @@ static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multipli
if (is_guest_mode(vcpu))
vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier(
l1_multiplier,
- static_call(kvm_x86_get_l2_tsc_multiplier)(vcpu));
+ kvm_x86_call(get_l2_tsc_multiplier)(vcpu));
else
vcpu->arch.tsc_scaling_ratio = l1_multiplier;
if (kvm_caps.has_tsc_control)
- static_call(kvm_x86_write_tsc_multiplier)(vcpu);
+ kvm_x86_call(write_tsc_multiplier)(vcpu);
}
static inline bool kvm_check_tsc_unstable(void)
@@ -3619,7 +3618,7 @@ static void kvmclock_reset(struct kvm_vcpu *vcpu)
static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu)
{
++vcpu->stat.tlb_flush;
- static_call(kvm_x86_flush_tlb_all)(vcpu);
+ kvm_x86_call(flush_tlb_all)(vcpu);
/* Flushing all ASIDs flushes the current ASID... */
kvm_clear_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
@@ -3640,7 +3639,7 @@ static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
kvm_mmu_sync_prev_roots(vcpu);
}
- static_call(kvm_x86_flush_tlb_guest)(vcpu);
+ kvm_x86_call(flush_tlb_guest)(vcpu);
/*
* Flushing all "guest" TLB is always a superset of Hyper-V's fine
@@ -3653,7 +3652,7 @@ static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
static inline void kvm_vcpu_flush_tlb_current(struct kvm_vcpu *vcpu)
{
++vcpu->stat.tlb_flush;
- static_call(kvm_x86_flush_tlb_current)(vcpu);
+ kvm_x86_call(flush_tlb_current)(vcpu);
}
/*
@@ -3770,18 +3769,6 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa));
}
-static bool kvm_is_msr_to_save(u32 msr_index)
-{
- unsigned int i;
-
- for (i = 0; i < num_msrs_to_save; i++) {
- if (msrs_to_save[i] == msr_index)
- return true;
- }
-
- return false;
-}
-
int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
u32 msr = msr_info->index;
@@ -4163,15 +4150,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (kvm_pmu_is_valid_msr(vcpu, msr))
return kvm_pmu_set_msr(vcpu, msr_info);
- /*
- * Userspace is allowed to write '0' to MSRs that KVM reports
- * as to-be-saved, even if an MSRs isn't fully supported.
- */
- if (msr_info->host_initiated && !data &&
- kvm_is_msr_to_save(msr))
- break;
-
- return KVM_MSR_RET_INVALID;
+ return KVM_MSR_RET_UNSUPPORTED;
}
return 0;
}
@@ -4522,17 +4501,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
return kvm_pmu_get_msr(vcpu, msr_info);
- /*
- * Userspace is allowed to read MSRs that KVM reports as
- * to-be-saved, even if an MSR isn't fully supported.
- */
- if (msr_info->host_initiated &&
- kvm_is_msr_to_save(msr_info->index)) {
- msr_info->data = 0;
- break;
- }
-
- return KVM_MSR_RET_INVALID;
+ return KVM_MSR_RET_UNSUPPORTED;
}
return 0;
}
@@ -4629,9 +4598,7 @@ static int kvm_ioctl_get_supported_hv_cpuid(struct kvm_vcpu *vcpu,
static bool kvm_is_vm_type_supported(unsigned long type)
{
- return type == KVM_X86_DEFAULT_VM ||
- (type == KVM_X86_SW_PROTECTED_VM &&
- IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) && tdp_mmu_enabled);
+ return type < 32 && (kvm_caps.supported_vm_types & BIT(type));
}
int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
@@ -4682,7 +4649,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_ASYNC_PF_INT:
case KVM_CAP_GET_TSC_KHZ:
case KVM_CAP_KVMCLOCK_CTRL:
- case KVM_CAP_READONLY_MEM:
case KVM_CAP_IOAPIC_POLARITY_IGNORED:
case KVM_CAP_TSC_DEADLINE_TIMER:
case KVM_CAP_DISABLE_QUIRKS:
@@ -4714,8 +4680,15 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_VM_DISABLE_NX_HUGE_PAGES:
case KVM_CAP_IRQFD_RESAMPLE:
case KVM_CAP_MEMORY_FAULT_INFO:
+ case KVM_CAP_X86_GUEST_MODE:
r = 1;
break;
+ case KVM_CAP_PRE_FAULT_MEMORY:
+ r = tdp_enabled;
+ break;
+ case KVM_CAP_X86_APIC_BUS_CYCLES_NS:
+ r = APIC_BUS_CYCLE_NS_DEFAULT;
+ break;
case KVM_CAP_EXIT_HYPERCALL:
r = KVM_EXIT_HYPERCALL_VALID_MASK;
break;
@@ -4764,7 +4737,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
* fringe case that is not enabled except via specific settings
* of the module parameters.
*/
- r = static_call(kvm_x86_has_emulated_msr)(kvm, MSR_IA32_SMBASE);
+ r = kvm_x86_call(has_emulated_msr)(kvm, MSR_IA32_SMBASE);
break;
case KVM_CAP_NR_VCPUS:
r = min_t(unsigned int, num_online_cpus(), KVM_MAX_VCPUS);
@@ -4832,9 +4805,10 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
r = kvm_caps.has_notify_vmexit;
break;
case KVM_CAP_VM_TYPES:
- r = BIT(KVM_X86_DEFAULT_VM);
- if (kvm_is_vm_type_supported(KVM_X86_SW_PROTECTED_VM))
- r |= BIT(KVM_X86_SW_PROTECTED_VM);
+ r = kvm_caps.supported_vm_types;
+ break;
+ case KVM_CAP_READONLY_MEM:
+ r = kvm ? kvm_arch_has_readonly_mem(kvm) : 1;
break;
default:
break;
@@ -4842,46 +4816,44 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
return r;
}
-static inline void __user *kvm_get_attr_addr(struct kvm_device_attr *attr)
+static int __kvm_x86_dev_get_attr(struct kvm_device_attr *attr, u64 *val)
{
- void __user *uaddr = (void __user*)(unsigned long)attr->addr;
-
- if ((u64)(unsigned long)uaddr != attr->addr)
- return ERR_PTR_USR(-EFAULT);
- return uaddr;
-}
-
-static int kvm_x86_dev_get_attr(struct kvm_device_attr *attr)
-{
- u64 __user *uaddr = kvm_get_attr_addr(attr);
-
- if (attr->group)
+ if (attr->group) {
+ if (kvm_x86_ops.dev_get_attr)
+ return kvm_x86_call(dev_get_attr)(attr->group, attr->attr, val);
return -ENXIO;
-
- if (IS_ERR(uaddr))
- return PTR_ERR(uaddr);
+ }
switch (attr->attr) {
case KVM_X86_XCOMP_GUEST_SUPP:
- if (put_user(kvm_caps.supported_xcr0, uaddr))
- return -EFAULT;
+ *val = kvm_caps.supported_xcr0;
return 0;
default:
return -ENXIO;
}
}
+static int kvm_x86_dev_get_attr(struct kvm_device_attr *attr)
+{
+ u64 __user *uaddr = u64_to_user_ptr(attr->addr);
+ int r;
+ u64 val;
+
+ r = __kvm_x86_dev_get_attr(attr, &val);
+ if (r < 0)
+ return r;
+
+ if (put_user(val, uaddr))
+ return -EFAULT;
+
+ return 0;
+}
+
static int kvm_x86_dev_has_attr(struct kvm_device_attr *attr)
{
- if (attr->group)
- return -ENXIO;
+ u64 val;
- switch (attr->attr) {
- case KVM_X86_XCOMP_GUEST_SUPP:
- return 0;
- default:
- return -ENXIO;
- }
+ return __kvm_x86_dev_get_attr(attr, &val);
}
long kvm_arch_dev_ioctl(struct file *filp,
@@ -4967,7 +4939,7 @@ long kvm_arch_dev_ioctl(struct file *filp,
break;
}
case KVM_GET_MSRS:
- r = msr_io(NULL, argp, do_get_msr_feature, 1);
+ r = msr_io(NULL, argp, do_get_feature_msr, 1);
break;
#ifdef CONFIG_KVM_HYPERV
case KVM_GET_SUPPORTED_HV_CPUID:
@@ -5010,16 +4982,25 @@ static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+
+ vcpu->arch.l1tf_flush_l1d = true;
+
+ if (vcpu->scheduled_out && pmu->version && pmu->event_count) {
+ pmu->need_cleanup = true;
+ kvm_make_request(KVM_REQ_PMU, vcpu);
+ }
+
/* Address WBINVD may be executed by guest */
if (need_emulate_wbinvd(vcpu)) {
- if (static_call(kvm_x86_has_wbinvd_exit)())
+ if (kvm_x86_call(has_wbinvd_exit)())
cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
smp_call_function_single(vcpu->cpu,
wbinvd_ipi, NULL, 1);
}
- static_call(kvm_x86_vcpu_load)(vcpu, cpu);
+ kvm_x86_call(vcpu_load)(vcpu, cpu);
/* Save host pkru register if supported */
vcpu->arch.host_pkru = read_pkru();
@@ -5127,14 +5108,14 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
srcu_read_unlock(&vcpu->kvm->srcu, idx);
}
- static_call(kvm_x86_vcpu_put)(vcpu);
+ kvm_x86_call(vcpu_put)(vcpu);
vcpu->arch.last_host_tsc = rdtsc();
}
static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
struct kvm_lapic_state *s)
{
- static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
+ kvm_x86_call(sync_pir_to_irr)(vcpu);
return kvm_apic_get_state(vcpu, s);
}
@@ -5251,7 +5232,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
kvm_apic_after_set_mcg_cap(vcpu);
- static_call(kvm_x86_setup_mce)(vcpu);
+ kvm_x86_call(setup_mce)(vcpu);
out:
return r;
}
@@ -5411,11 +5392,11 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
events->interrupt.injected =
vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft;
events->interrupt.nr = vcpu->arch.interrupt.nr;
- events->interrupt.shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu);
+ events->interrupt.shadow = kvm_x86_call(get_interrupt_shadow)(vcpu);
events->nmi.injected = vcpu->arch.nmi_injected;
events->nmi.pending = kvm_get_nr_pending_nmis(vcpu);
- events->nmi.masked = static_call(kvm_x86_get_nmi_mask)(vcpu);
+ events->nmi.masked = kvm_x86_call(get_nmi_mask)(vcpu);
/* events->sipi_vector is never valid when reporting to user space */
@@ -5497,8 +5478,8 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
vcpu->arch.interrupt.nr = events->interrupt.nr;
vcpu->arch.interrupt.soft = events->interrupt.soft;
if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
- static_call(kvm_x86_set_interrupt_shadow)(vcpu,
- events->interrupt.shadow);
+ kvm_x86_call(set_interrupt_shadow)(vcpu,
+ events->interrupt.shadow);
vcpu->arch.nmi_injected = events->nmi.injected;
if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) {
@@ -5507,7 +5488,7 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
if (events->nmi.pending)
kvm_make_request(KVM_REQ_NMI, vcpu);
}
- static_call(kvm_x86_set_nmi_mask)(vcpu, events->nmi.masked);
+ kvm_x86_call(set_nmi_mask)(vcpu, events->nmi.masked);
if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR &&
lapic_in_kernel(vcpu))
@@ -5557,11 +5538,15 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
return 0;
}
-static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
- struct kvm_debugregs *dbgregs)
+static int kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
+ struct kvm_debugregs *dbgregs)
{
unsigned int i;
+ if (vcpu->kvm->arch.has_protected_state &&
+ vcpu->arch.guest_state_protected)
+ return -EINVAL;
+
memset(dbgregs, 0, sizeof(*dbgregs));
BUILD_BUG_ON(ARRAY_SIZE(vcpu->arch.db) != ARRAY_SIZE(dbgregs->db));
@@ -5570,6 +5555,7 @@ static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
dbgregs->dr6 = vcpu->arch.dr6;
dbgregs->dr7 = vcpu->arch.dr7;
+ return 0;
}
static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
@@ -5577,6 +5563,10 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
{
unsigned int i;
+ if (vcpu->kvm->arch.has_protected_state &&
+ vcpu->arch.guest_state_protected)
+ return -EINVAL;
+
if (dbgregs->flags)
return -EINVAL;
@@ -5597,8 +5587,8 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
}
-static void kvm_vcpu_ioctl_x86_get_xsave2(struct kvm_vcpu *vcpu,
- u8 *state, unsigned int size)
+static int kvm_vcpu_ioctl_x86_get_xsave2(struct kvm_vcpu *vcpu,
+ u8 *state, unsigned int size)
{
/*
* Only copy state for features that are enabled for the guest. The
@@ -5616,24 +5606,25 @@ static void kvm_vcpu_ioctl_x86_get_xsave2(struct kvm_vcpu *vcpu,
XFEATURE_MASK_FPSSE;
if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
- return;
+ return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;
fpu_copy_guest_fpstate_to_uabi(&vcpu->arch.guest_fpu, state, size,
supported_xcr0, vcpu->arch.pkru);
+ return 0;
}
-static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
- struct kvm_xsave *guest_xsave)
+static int kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
+ struct kvm_xsave *guest_xsave)
{
- kvm_vcpu_ioctl_x86_get_xsave2(vcpu, (void *)guest_xsave->region,
- sizeof(guest_xsave->region));
+ return kvm_vcpu_ioctl_x86_get_xsave2(vcpu, (void *)guest_xsave->region,
+ sizeof(guest_xsave->region));
}
static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
struct kvm_xsave *guest_xsave)
{
if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
- return 0;
+ return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;
return fpu_copy_uabi_to_guest_fpstate(&vcpu->arch.guest_fpu,
guest_xsave->region,
@@ -5641,18 +5632,23 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
&vcpu->arch.pkru);
}
-static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu,
- struct kvm_xcrs *guest_xcrs)
+static int kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu,
+ struct kvm_xcrs *guest_xcrs)
{
+ if (vcpu->kvm->arch.has_protected_state &&
+ vcpu->arch.guest_state_protected)
+ return -EINVAL;
+
if (!boot_cpu_has(X86_FEATURE_XSAVE)) {
guest_xcrs->nr_xcrs = 0;
- return;
+ return 0;
}
guest_xcrs->nr_xcrs = 1;
guest_xcrs->flags = 0;
guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK;
guest_xcrs->xcrs[0].value = vcpu->arch.xcr0;
+ return 0;
}
static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
@@ -5660,6 +5656,10 @@ static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
{
int i, r = 0;
+ if (vcpu->kvm->arch.has_protected_state &&
+ vcpu->arch.guest_state_protected)
+ return -EINVAL;
+
if (!boot_cpu_has(X86_FEATURE_XSAVE))
return -EINVAL;
@@ -5712,12 +5712,9 @@ static int kvm_arch_tsc_has_attr(struct kvm_vcpu *vcpu,
static int kvm_arch_tsc_get_attr(struct kvm_vcpu *vcpu,
struct kvm_device_attr *attr)
{
- u64 __user *uaddr = kvm_get_attr_addr(attr);
+ u64 __user *uaddr = u64_to_user_ptr(attr->addr);
int r;
- if (IS_ERR(uaddr))
- return PTR_ERR(uaddr);
-
switch (attr->attr) {
case KVM_VCPU_TSC_OFFSET:
r = -EFAULT;
@@ -5735,13 +5732,10 @@ static int kvm_arch_tsc_get_attr(struct kvm_vcpu *vcpu,
static int kvm_arch_tsc_set_attr(struct kvm_vcpu *vcpu,
struct kvm_device_attr *attr)
{
- u64 __user *uaddr = kvm_get_attr_addr(attr);
+ u64 __user *uaddr = u64_to_user_ptr(attr->addr);
struct kvm *kvm = vcpu->kvm;
int r;
- if (IS_ERR(uaddr))
- return PTR_ERR(uaddr);
-
switch (attr->attr) {
case KVM_VCPU_TSC_OFFSET: {
u64 offset, tsc, ns;
@@ -5842,7 +5836,7 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
if (!kvm_x86_ops.enable_l2_tlb_flush)
return -ENOTTY;
- return static_call(kvm_x86_enable_l2_tlb_flush)(vcpu);
+ return kvm_x86_call(enable_l2_tlb_flush)(vcpu);
case KVM_CAP_HYPERV_ENFORCE_CPUID:
return kvm_hv_set_enforce_cpuid(vcpu, cap->args[0]);
@@ -5881,8 +5875,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
r = -EINVAL;
if (!lapic_in_kernel(vcpu))
goto out;
- u.lapic = kzalloc(sizeof(struct kvm_lapic_state),
- GFP_KERNEL_ACCOUNT);
+ u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
r = -ENOMEM;
if (!u.lapic)
@@ -6042,13 +6035,17 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
if (copy_from_user(&events, argp, sizeof(struct kvm_vcpu_events)))
break;
+ kvm_vcpu_srcu_read_lock(vcpu);
r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events);
+ kvm_vcpu_srcu_read_unlock(vcpu);
break;
}
case KVM_GET_DEBUGREGS: {
struct kvm_debugregs dbgregs;
- kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs);
+ r = kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs);
+ if (r < 0)
+ break;
r = -EFAULT;
if (copy_to_user(argp, &dbgregs,
@@ -6073,12 +6070,14 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
if (vcpu->arch.guest_fpu.uabi_size > sizeof(struct kvm_xsave))
break;
- u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL_ACCOUNT);
+ u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
r = -ENOMEM;
if (!u.xsave)
break;
- kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave);
+ r = kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave);
+ if (r < 0)
+ break;
r = -EFAULT;
if (copy_to_user(argp, u.xsave, sizeof(struct kvm_xsave)))
@@ -6102,12 +6101,14 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
case KVM_GET_XSAVE2: {
int size = vcpu->arch.guest_fpu.uabi_size;
- u.xsave = kzalloc(size, GFP_KERNEL_ACCOUNT);
+ u.xsave = kzalloc(size, GFP_KERNEL);
r = -ENOMEM;
if (!u.xsave)
break;
- kvm_vcpu_ioctl_x86_get_xsave2(vcpu, u.buffer, size);
+ r = kvm_vcpu_ioctl_x86_get_xsave2(vcpu, u.buffer, size);
+ if (r < 0)
+ break;
r = -EFAULT;
if (copy_to_user(argp, u.xsave, size))
@@ -6118,12 +6119,14 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
}
case KVM_GET_XCRS: {
- u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL_ACCOUNT);
+ u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL);
r = -ENOMEM;
if (!u.xcrs)
break;
- kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs);
+ r = kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs);
+ if (r < 0)
+ break;
r = -EFAULT;
if (copy_to_user(argp, u.xcrs,
@@ -6267,6 +6270,11 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
}
#endif
case KVM_GET_SREGS2: {
+ r = -EINVAL;
+ if (vcpu->kvm->arch.has_protected_state &&
+ vcpu->arch.guest_state_protected)
+ goto out;
+
u.sregs2 = kzalloc(sizeof(struct kvm_sregs2), GFP_KERNEL);
r = -ENOMEM;
if (!u.sregs2)
@@ -6279,6 +6287,11 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
break;
}
case KVM_SET_SREGS2: {
+ r = -EINVAL;
+ if (vcpu->kvm->arch.has_protected_state &&
+ vcpu->arch.guest_state_protected)
+ goto out;
+
u.sregs2 = memdup_user(argp, sizeof(struct kvm_sregs2));
if (IS_ERR(u.sregs2)) {
r = PTR_ERR(u.sregs2);
@@ -6314,14 +6327,14 @@ static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
if (addr > (unsigned int)(-3 * PAGE_SIZE))
return -EINVAL;
- ret = static_call(kvm_x86_set_tss_addr)(kvm, addr);
+ ret = kvm_x86_call(set_tss_addr)(kvm, addr);
return ret;
}
static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm,
u64 ident_addr)
{
- return static_call(kvm_x86_set_identity_map_addr)(kvm, ident_addr);
+ return kvm_x86_call(set_identity_map_addr)(kvm, ident_addr);
}
static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
@@ -6527,9 +6540,6 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
goto split_irqchip_unlock;
if (kvm->created_vcpus)
goto split_irqchip_unlock;
- r = kvm_setup_empty_irq_routing(kvm);
- if (r)
- goto split_irqchip_unlock;
/* Pairs with irqchip_in_kernel. */
smp_wmb();
kvm->arch.irqchip_mode = KVM_IRQCHIP_SPLIT;
@@ -6634,14 +6644,14 @@ split_irqchip_unlock:
if (!kvm_x86_ops.vm_copy_enc_context_from)
break;
- r = static_call(kvm_x86_vm_copy_enc_context_from)(kvm, cap->args[0]);
+ r = kvm_x86_call(vm_copy_enc_context_from)(kvm, cap->args[0]);
break;
case KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM:
r = -EINVAL;
if (!kvm_x86_ops.vm_move_enc_context_from)
break;
- r = static_call(kvm_x86_vm_move_enc_context_from)(kvm, cap->args[0]);
+ r = kvm_x86_call(vm_move_enc_context_from)(kvm, cap->args[0]);
break;
case KVM_CAP_EXIT_HYPERCALL:
if (cap->args[0] & ~KVM_EXIT_HYPERCALL_VALID_MASK) {
@@ -6676,7 +6686,9 @@ split_irqchip_unlock:
break;
mutex_lock(&kvm->lock);
- if (kvm->arch.max_vcpu_ids == cap->args[0]) {
+ if (kvm->arch.bsp_vcpu_id > cap->args[0]) {
+ ;
+ } else if (kvm->arch.max_vcpu_ids == cap->args[0]) {
r = 0;
} else if (!kvm->arch.max_vcpu_ids) {
kvm->arch.max_vcpu_ids = cap->args[0];
@@ -6729,6 +6741,30 @@ split_irqchip_unlock:
}
mutex_unlock(&kvm->lock);
break;
+ case KVM_CAP_X86_APIC_BUS_CYCLES_NS: {
+ u64 bus_cycle_ns = cap->args[0];
+ u64 unused;
+
+ /*
+ * Guard against overflow in tmict_to_ns(). 128 is the highest
+ * divide value that can be programmed in APIC_TDCR.
+ */
+ r = -EINVAL;
+ if (!bus_cycle_ns ||
+ check_mul_overflow((u64)U32_MAX * 128, bus_cycle_ns, &unused))
+ break;
+
+ r = 0;
+ mutex_lock(&kvm->lock);
+ if (!irqchip_in_kernel(kvm))
+ r = -ENXIO;
+ else if (kvm->created_vcpus)
+ r = -EINVAL;
+ else
+ kvm->arch.apic_bus_cycle_ns = bus_cycle_ns;
+ mutex_unlock(&kvm->lock);
+ break;
+ }
default:
r = -EINVAL;
break;
@@ -7197,6 +7233,9 @@ set_pit2_out:
mutex_lock(&kvm->lock);
if (kvm->created_vcpus)
r = -EBUSY;
+ else if (arg > KVM_MAX_VCPU_IDS ||
+ (kvm->arch.max_vcpu_ids && arg > kvm->arch.max_vcpu_ids))
+ r = -EINVAL;
else
kvm->arch.bsp_vcpu_id = arg;
mutex_unlock(&kvm->lock);
@@ -7273,7 +7312,7 @@ set_pit2_out:
if (!kvm_x86_ops.mem_enc_ioctl)
goto out;
- r = static_call(kvm_x86_mem_enc_ioctl)(kvm, argp);
+ r = kvm_x86_call(mem_enc_ioctl)(kvm, argp);
break;
}
case KVM_MEMORY_ENCRYPT_REG_REGION: {
@@ -7287,7 +7326,7 @@ set_pit2_out:
if (!kvm_x86_ops.mem_enc_register_region)
goto out;
- r = static_call(kvm_x86_mem_enc_register_region)(kvm, &region);
+ r = kvm_x86_call(mem_enc_register_region)(kvm, &region);
break;
}
case KVM_MEMORY_ENCRYPT_UNREG_REGION: {
@@ -7301,7 +7340,7 @@ set_pit2_out:
if (!kvm_x86_ops.mem_enc_unregister_region)
goto out;
- r = static_call(kvm_x86_mem_enc_unregister_region)(kvm, &region);
+ r = kvm_x86_call(mem_enc_unregister_region)(kvm, &region);
break;
}
#ifdef CONFIG_KVM_HYPERV
@@ -7337,11 +7376,9 @@ out:
static void kvm_probe_feature_msr(u32 msr_index)
{
- struct kvm_msr_entry msr = {
- .index = msr_index,
- };
+ u64 data;
- if (kvm_get_msr_feature(&msr))
+ if (kvm_get_feature_msr(NULL, msr_index, &data, true))
return;
msr_based_features[num_msr_based_features++] = msr_index;
@@ -7395,17 +7432,20 @@ static void kvm_probe_msr_to_save(u32 msr_index)
intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2))
return;
break;
- case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR_MAX:
+ case MSR_ARCH_PERFMON_PERFCTR0 ...
+ MSR_ARCH_PERFMON_PERFCTR0 + KVM_MAX_NR_GP_COUNTERS - 1:
if (msr_index - MSR_ARCH_PERFMON_PERFCTR0 >=
kvm_pmu_cap.num_counters_gp)
return;
break;
- case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL_MAX:
+ case MSR_ARCH_PERFMON_EVENTSEL0 ...
+ MSR_ARCH_PERFMON_EVENTSEL0 + KVM_MAX_NR_GP_COUNTERS - 1:
if (msr_index - MSR_ARCH_PERFMON_EVENTSEL0 >=
kvm_pmu_cap.num_counters_gp)
return;
break;
- case MSR_ARCH_PERFMON_FIXED_CTR0 ... MSR_ARCH_PERFMON_FIXED_CTR_MAX:
+ case MSR_ARCH_PERFMON_FIXED_CTR0 ...
+ MSR_ARCH_PERFMON_FIXED_CTR0 + KVM_MAX_NR_FIXED_COUNTERS - 1:
if (msr_index - MSR_ARCH_PERFMON_FIXED_CTR0 >=
kvm_pmu_cap.num_counters_fixed)
return;
@@ -7436,7 +7476,7 @@ static void kvm_init_msr_lists(void)
{
unsigned i;
- BUILD_BUG_ON_MSG(KVM_PMC_MAX_FIXED != 3,
+ BUILD_BUG_ON_MSG(KVM_MAX_NR_FIXED_COUNTERS != 3,
"Please update the fixed PMCs in msrs_to_save_pmu[]");
num_msrs_to_save = 0;
@@ -7452,7 +7492,8 @@ static void kvm_init_msr_lists(void)
}
for (i = 0; i < ARRAY_SIZE(emulated_msrs_all); i++) {
- if (!static_call(kvm_x86_has_emulated_msr)(NULL, emulated_msrs_all[i]))
+ if (!kvm_x86_call(has_emulated_msr)(NULL,
+ emulated_msrs_all[i]))
continue;
emulated_msrs[num_emulated_msrs++] = emulated_msrs_all[i];
@@ -7511,13 +7552,13 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
void kvm_set_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg)
{
- static_call(kvm_x86_set_segment)(vcpu, var, seg);
+ kvm_x86_call(set_segment)(vcpu, var, seg);
}
void kvm_get_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg)
{
- static_call(kvm_x86_get_segment)(vcpu, var, seg);
+ kvm_x86_call(get_segment)(vcpu, var, seg);
}
gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u64 access,
@@ -7540,7 +7581,7 @@ gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
{
struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
- u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ u64 access = (kvm_x86_call(get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception);
}
EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_read);
@@ -7550,7 +7591,7 @@ gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
{
struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
- u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ u64 access = (kvm_x86_call(get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
access |= PFERR_WRITE_MASK;
return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception);
}
@@ -7603,7 +7644,7 @@ static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt,
{
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
- u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ u64 access = (kvm_x86_call(get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
unsigned offset;
int ret;
@@ -7628,7 +7669,7 @@ int kvm_read_guest_virt(struct kvm_vcpu *vcpu,
gva_t addr, void *val, unsigned int bytes,
struct x86_exception *exception)
{
- u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ u64 access = (kvm_x86_call(get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
/*
* FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED
@@ -7651,7 +7692,7 @@ static int emulator_read_std(struct x86_emulate_ctxt *ctxt,
if (system)
access |= PFERR_IMPLICIT_ACCESS;
- else if (static_call(kvm_x86_get_cpl)(vcpu) == 3)
+ else if (kvm_x86_call(get_cpl)(vcpu) == 3)
access |= PFERR_USER_MASK;
return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, exception);
@@ -7696,7 +7737,7 @@ static int emulator_write_std(struct x86_emulate_ctxt *ctxt, gva_t addr, void *v
if (system)
access |= PFERR_IMPLICIT_ACCESS;
- else if (static_call(kvm_x86_get_cpl)(vcpu) == 3)
+ else if (kvm_x86_call(get_cpl)(vcpu) == 3)
access |= PFERR_USER_MASK;
return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
@@ -7717,8 +7758,8 @@ EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system);
static int kvm_check_emulate_insn(struct kvm_vcpu *vcpu, int emul_type,
void *insn, int insn_len)
{
- return static_call(kvm_x86_check_emulate_instruction)(vcpu, emul_type,
- insn, insn_len);
+ return kvm_x86_call(check_emulate_instruction)(vcpu, emul_type,
+ insn, insn_len);
}
int handle_ud(struct kvm_vcpu *vcpu)
@@ -7768,8 +7809,8 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
bool write)
{
struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
- u64 access = ((static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0)
- | (write ? PFERR_WRITE_MASK : 0);
+ u64 access = ((kvm_x86_call(get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0)
+ | (write ? PFERR_WRITE_MASK : 0);
/*
* currently PKRU is only applied to ept enabled guest so
@@ -8195,7 +8236,7 @@ static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
{
- return static_call(kvm_x86_get_segment_base)(vcpu, seg);
+ return kvm_x86_call(get_segment_base)(vcpu, seg);
}
static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address)
@@ -8208,7 +8249,7 @@ static int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu)
if (!need_emulate_wbinvd(vcpu))
return X86EMUL_CONTINUE;
- if (static_call(kvm_x86_has_wbinvd_exit)()) {
+ if (kvm_x86_call(has_wbinvd_exit)()) {
int cpu = get_cpu();
cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
@@ -8312,27 +8353,27 @@ static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val)
static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt)
{
- return static_call(kvm_x86_get_cpl)(emul_to_vcpu(ctxt));
+ return kvm_x86_call(get_cpl)(emul_to_vcpu(ctxt));
}
static void emulator_get_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
{
- static_call(kvm_x86_get_gdt)(emul_to_vcpu(ctxt), dt);
+ kvm_x86_call(get_gdt)(emul_to_vcpu(ctxt), dt);
}
static void emulator_get_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
{
- static_call(kvm_x86_get_idt)(emul_to_vcpu(ctxt), dt);
+ kvm_x86_call(get_idt)(emul_to_vcpu(ctxt), dt);
}
static void emulator_set_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
{
- static_call(kvm_x86_set_gdt)(emul_to_vcpu(ctxt), dt);
+ kvm_x86_call(set_gdt)(emul_to_vcpu(ctxt), dt);
}
static void emulator_set_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
{
- static_call(kvm_x86_set_idt)(emul_to_vcpu(ctxt), dt);
+ kvm_x86_call(set_idt)(emul_to_vcpu(ctxt), dt);
}
static unsigned long emulator_get_cached_segment_base(
@@ -8479,8 +8520,8 @@ static int emulator_intercept(struct x86_emulate_ctxt *ctxt,
struct x86_instruction_info *info,
enum x86_intercept_stage stage)
{
- return static_call(kvm_x86_check_intercept)(emul_to_vcpu(ctxt), info, stage,
- &ctxt->exception);
+ return kvm_x86_call(check_intercept)(emul_to_vcpu(ctxt), info, stage,
+ &ctxt->exception);
}
static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,
@@ -8505,6 +8546,11 @@ static bool emulator_guest_has_rdpid(struct x86_emulate_ctxt *ctxt)
return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_RDPID);
}
+static bool emulator_guest_cpuid_is_intel_compatible(struct x86_emulate_ctxt *ctxt)
+{
+ return guest_cpuid_is_intel_compatible(emul_to_vcpu(ctxt));
+}
+
static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg)
{
return kvm_register_read_raw(emul_to_vcpu(ctxt), reg);
@@ -8517,7 +8563,7 @@ static void emulator_write_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg, ulon
static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked)
{
- static_call(kvm_x86_set_nmi_mask)(emul_to_vcpu(ctxt), masked);
+ kvm_x86_call(set_nmi_mask)(emul_to_vcpu(ctxt), masked);
}
static bool emulator_is_smm(struct x86_emulate_ctxt *ctxt)
@@ -8562,7 +8608,8 @@ static gva_t emulator_get_untagged_addr(struct x86_emulate_ctxt *ctxt,
if (!kvm_x86_ops.get_untagged_addr)
return addr;
- return static_call(kvm_x86_get_untagged_addr)(emul_to_vcpu(ctxt), addr, flags);
+ return kvm_x86_call(get_untagged_addr)(emul_to_vcpu(ctxt),
+ addr, flags);
}
static const struct x86_emulate_ops emulate_ops = {
@@ -8603,6 +8650,7 @@ static const struct x86_emulate_ops emulate_ops = {
.guest_has_movbe = emulator_guest_has_movbe,
.guest_has_fxsr = emulator_guest_has_fxsr,
.guest_has_rdpid = emulator_guest_has_rdpid,
+ .guest_cpuid_is_intel_compatible = emulator_guest_cpuid_is_intel_compatible,
.set_nmi_mask = emulator_set_nmi_mask,
.is_smm = emulator_is_smm,
.is_guest_mode = emulator_is_guest_mode,
@@ -8614,7 +8662,7 @@ static const struct x86_emulate_ops emulate_ops = {
static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
{
- u32 int_shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu);
+ u32 int_shadow = kvm_x86_call(get_interrupt_shadow)(vcpu);
/*
* an sti; sti; sequence only disable interrupts for the first
* instruction. So, if the last instruction, be it emulated or
@@ -8625,7 +8673,7 @@ static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
if (int_shadow & mask)
mask = 0;
if (unlikely(int_shadow || mask)) {
- static_call(kvm_x86_set_interrupt_shadow)(vcpu, mask);
+ kvm_x86_call(set_interrupt_shadow)(vcpu, mask);
if (!mask)
kvm_make_request(KVM_REQ_EVENT, vcpu);
}
@@ -8666,7 +8714,7 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
int cs_db, cs_l;
- static_call(kvm_x86_get_cs_db_l_bits)(vcpu, &cs_db, &cs_l);
+ kvm_x86_call(get_cs_db_l_bits)(vcpu, &cs_db, &cs_l);
ctxt->gpa_available = false;
ctxt->eflags = kvm_get_rflags(vcpu);
@@ -8722,9 +8770,8 @@ static void prepare_emulation_failure_exit(struct kvm_vcpu *vcpu, u64 *data,
*/
memset(&info, 0, sizeof(info));
- static_call(kvm_x86_get_exit_info)(vcpu, (u32 *)&info[0], &info[1],
- &info[2], (u32 *)&info[3],
- (u32 *)&info[4]);
+ kvm_x86_call(get_exit_info)(vcpu, (u32 *)&info[0], &info[1], &info[2],
+ (u32 *)&info[3], (u32 *)&info[4]);
run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
run->emulation_failure.suberror = KVM_INTERNAL_ERROR_EMULATION;
@@ -8801,7 +8848,7 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
kvm_queue_exception(vcpu, UD_VECTOR);
- if (!is_guest_mode(vcpu) && static_call(kvm_x86_get_cpl)(vcpu) == 0) {
+ if (!is_guest_mode(vcpu) && kvm_x86_call(get_cpl)(vcpu) == 0) {
prepare_emulation_ctxt_failure_exit(vcpu);
return 0;
}
@@ -8809,60 +8856,13 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
return 1;
}
-static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
- int emulation_type)
+static bool kvm_unprotect_and_retry_on_failure(struct kvm_vcpu *vcpu,
+ gpa_t cr2_or_gpa,
+ int emulation_type)
{
- gpa_t gpa = cr2_or_gpa;
- kvm_pfn_t pfn;
-
if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF))
return false;
- if (WARN_ON_ONCE(is_guest_mode(vcpu)) ||
- WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF)))
- return false;
-
- if (!vcpu->arch.mmu->root_role.direct) {
- /*
- * Write permission should be allowed since only
- * write access need to be emulated.
- */
- gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL);
-
- /*
- * If the mapping is invalid in guest, let cpu retry
- * it to generate fault.
- */
- if (gpa == INVALID_GPA)
- return true;
- }
-
- /*
- * Do not retry the unhandleable instruction if it faults on the
- * readonly host memory, otherwise it will goto a infinite loop:
- * retry instruction -> write #PF -> emulation fail -> retry
- * instruction -> ...
- */
- pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa));
-
- /*
- * If the instruction failed on the error pfn, it can not be fixed,
- * report the error to userspace.
- */
- if (is_error_noslot_pfn(pfn))
- return false;
-
- kvm_release_pfn_clean(pfn);
-
- /*
- * If emulation may have been triggered by a write to a shadowed page
- * table, unprotect the gfn (zap any relevant SPTEs) and re-enter the
- * guest to let the CPU re-execute the instruction in the hope that the
- * CPU can cleanly execute the instruction that KVM failed to emulate.
- */
- if (vcpu->kvm->arch.indirect_shadow_pages)
- kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
-
/*
* If the failed instruction faulted on an access to page tables that
* are used to translate any part of the instruction, KVM can't resolve
@@ -8873,54 +8873,24 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
* then zap the SPTE to unprotect the gfn, and then do it all over
* again. Report the error to userspace.
*/
- return !(emulation_type & EMULTYPE_WRITE_PF_TO_SP);
-}
-
-static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
- gpa_t cr2_or_gpa, int emulation_type)
-{
- struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
- unsigned long last_retry_eip, last_retry_addr, gpa = cr2_or_gpa;
-
- last_retry_eip = vcpu->arch.last_retry_eip;
- last_retry_addr = vcpu->arch.last_retry_addr;
+ if (emulation_type & EMULTYPE_WRITE_PF_TO_SP)
+ return false;
/*
- * If the emulation is caused by #PF and it is non-page_table
- * writing instruction, it means the VM-EXIT is caused by shadow
- * page protected, we can zap the shadow page and retry this
- * instruction directly.
- *
- * Note: if the guest uses a non-page-table modifying instruction
- * on the PDE that points to the instruction, then we will unmap
- * the instruction and go to an infinite loop. So, we cache the
- * last retried eip and the last fault address, if we meet the eip
- * and the address again, we can break out of the potential infinite
- * loop.
+ * If emulation may have been triggered by a write to a shadowed page
+ * table, unprotect the gfn (zap any relevant SPTEs) and re-enter the
+ * guest to let the CPU re-execute the instruction in the hope that the
+ * CPU can cleanly execute the instruction that KVM failed to emulate.
*/
- vcpu->arch.last_retry_eip = vcpu->arch.last_retry_addr = 0;
-
- if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF))
- return false;
-
- if (WARN_ON_ONCE(is_guest_mode(vcpu)) ||
- WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF)))
- return false;
-
- if (x86_page_table_writing_insn(ctxt))
- return false;
-
- if (ctxt->eip == last_retry_eip && last_retry_addr == cr2_or_gpa)
- return false;
-
- vcpu->arch.last_retry_eip = ctxt->eip;
- vcpu->arch.last_retry_addr = cr2_or_gpa;
-
- if (!vcpu->arch.mmu->root_role.direct)
- gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL);
-
- kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
+ __kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa, true);
+ /*
+ * Retry even if _this_ vCPU didn't unprotect the gfn, as it's possible
+ * all SPTEs were already zapped by a different task. The alternative
+ * is to report the error to userspace and likely terminate the guest,
+ * and the last_retry_{eip,addr} checks will prevent retrying the page
+ * fault indefinitely, i.e. there's nothing to lose by retrying.
+ */
return true;
}
@@ -8959,10 +8929,10 @@ static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu)
int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
- unsigned long rflags = static_call(kvm_x86_get_rflags)(vcpu);
+ unsigned long rflags = kvm_x86_call(get_rflags)(vcpu);
int r;
- r = static_call(kvm_x86_skip_emulated_instruction)(vcpu);
+ r = kvm_x86_call(skip_emulated_instruction)(vcpu);
if (unlikely(!r))
return 0;
@@ -8984,19 +8954,17 @@ EXPORT_SYMBOL_GPL(kvm_skip_emulated_instruction);
static bool kvm_is_code_breakpoint_inhibited(struct kvm_vcpu *vcpu)
{
- u32 shadow;
-
if (kvm_get_rflags(vcpu) & X86_EFLAGS_RF)
return true;
/*
- * Intel CPUs inhibit code #DBs when MOV/POP SS blocking is active,
- * but AMD CPUs do not. MOV/POP SS blocking is rare, check that first
- * to avoid the relatively expensive CPUID lookup.
+ * Intel compatible CPUs inhibit code #DBs when MOV/POP SS blocking is
+ * active, but AMD compatible CPUs do not.
*/
- shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu);
- return (shadow & KVM_X86_SHADOW_INT_MOV_SS) &&
- guest_cpuid_is_intel(vcpu);
+ if (!guest_cpuid_is_intel_compatible(vcpu))
+ return false;
+
+ return kvm_x86_call(get_interrupt_shadow)(vcpu) & KVM_X86_SHADOW_INT_MOV_SS;
}
static bool kvm_vcpu_check_code_breakpoint(struct kvm_vcpu *vcpu,
@@ -9122,6 +9090,11 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
bool writeback = true;
+ if ((emulation_type & EMULTYPE_ALLOW_RETRY_PF) &&
+ (WARN_ON_ONCE(is_guest_mode(vcpu)) ||
+ WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF))))
+ emulation_type &= ~EMULTYPE_ALLOW_RETRY_PF;
+
r = kvm_check_emulate_insn(vcpu, emulation_type, insn, insn_len);
if (r != X86EMUL_CONTINUE) {
if (r == X86EMUL_RETRY_INSTR || r == X86EMUL_PROPAGATE_FAULT)
@@ -9152,8 +9125,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
}
- if (reexecute_instruction(vcpu, cr2_or_gpa,
- emulation_type))
+ if (kvm_unprotect_and_retry_on_failure(vcpu, cr2_or_gpa,
+ emulation_type))
return 1;
if (ctxt->have_exception &&
@@ -9200,7 +9173,15 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
return 1;
}
- if (retry_instruction(ctxt, cr2_or_gpa, emulation_type))
+ /*
+ * If emulation was caused by a write-protection #PF on a non-page_table
+ * writing instruction, try to unprotect the gfn, i.e. zap shadow pages,
+ * and retry the instruction, as the vCPU is likely no longer using the
+ * gfn as a page table.
+ */
+ if ((emulation_type & EMULTYPE_ALLOW_RETRY_PF) &&
+ !x86_page_table_writing_insn(ctxt) &&
+ kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa))
return 1;
/* this is needed for vmware backdoor interface to work since it
@@ -9231,7 +9212,8 @@ restart:
return 1;
if (r == EMULATION_FAILED) {
- if (reexecute_instruction(vcpu, cr2_or_gpa, emulation_type))
+ if (kvm_unprotect_and_retry_on_failure(vcpu, cr2_or_gpa,
+ emulation_type))
return 1;
return handle_emulation_failure(vcpu, emulation_type);
@@ -9268,7 +9250,7 @@ restart:
writeback:
if (writeback) {
- unsigned long rflags = static_call(kvm_x86_get_rflags)(vcpu);
+ unsigned long rflags = kvm_x86_call(get_rflags)(vcpu);
toggle_interruptibility(vcpu, ctxt->interruptibility);
vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
@@ -9285,7 +9267,7 @@ writeback:
kvm_rip_write(vcpu, ctxt->eip);
if (r && (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)))
r = kvm_vcpu_do_singlestep(vcpu);
- static_call_cond(kvm_x86_update_emulated_instruction)(vcpu);
+ kvm_x86_call(update_emulated_instruction)(vcpu);
__kvm_set_rflags(vcpu, ctxt->eflags);
}
@@ -9684,7 +9666,7 @@ static int kvm_x86_check_processor_compatibility(void)
__cr4_reserved_bits(cpu_has, &boot_cpu_data))
return -EIO;
- return static_call(kvm_x86_check_processor_compatibility)();
+ return kvm_x86_call(check_processor_compatibility)();
}
static void kvm_x86_check_cpu_compat(void *ret)
@@ -9699,7 +9681,7 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
guard(mutex)(&vendor_module_lock);
- if (kvm_x86_ops.hardware_enable) {
+ if (kvm_x86_ops.enable_virtualization_cpu) {
pr_err("already loaded vendor module '%s'\n", kvm_x86_ops.name);
return -EEXIST;
}
@@ -9732,6 +9714,8 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
return -EIO;
}
+ memset(&kvm_caps, 0, sizeof(kvm_caps));
+
x86_emulator_cache = kvm_alloc_emulator_cache();
if (!x86_emulator_cache) {
pr_err("failed to allocate cache for x86 emulator\n");
@@ -9750,20 +9734,23 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
if (r)
goto out_free_percpu;
+ kvm_caps.supported_vm_types = BIT(KVM_X86_DEFAULT_VM);
+ kvm_caps.supported_mce_cap = MCG_CTL_P | MCG_SER_P;
+
if (boot_cpu_has(X86_FEATURE_XSAVE)) {
- host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
- kvm_caps.supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0;
+ kvm_host.xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
+ kvm_caps.supported_xcr0 = kvm_host.xcr0 & KVM_SUPPORTED_XCR0;
}
- rdmsrl_safe(MSR_EFER, &host_efer);
+ rdmsrl_safe(MSR_EFER, &kvm_host.efer);
if (boot_cpu_has(X86_FEATURE_XSAVES))
- rdmsrl(MSR_IA32_XSS, host_xss);
+ rdmsrl(MSR_IA32_XSS, kvm_host.xss);
kvm_init_pmu_capability(ops->pmu_ops);
if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
- rdmsrl(MSR_IA32_ARCH_CAPABILITIES, host_arch_capabilities);
+ rdmsrl(MSR_IA32_ARCH_CAPABILITIES, kvm_host.arch_capabilities);
r = ops->hardware_setup();
if (r != 0)
@@ -9795,6 +9782,9 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
kvm_register_perf_callbacks(ops->handle_intel_pt_intr);
+ if (IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) && tdp_mmu_enabled)
+ kvm_caps.supported_vm_types |= BIT(KVM_X86_SW_PROTECTED_VM);
+
if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES))
kvm_caps.supported_xss = 0;
@@ -9818,8 +9808,8 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
return 0;
out_unwind_ops:
- kvm_x86_ops.hardware_enable = NULL;
- static_call(kvm_x86_hardware_unsetup)();
+ kvm_x86_ops.enable_virtualization_cpu = NULL;
+ kvm_x86_call(hardware_unsetup)();
out_mmu_exit:
kvm_mmu_vendor_module_exit();
out_free_percpu:
@@ -9850,7 +9840,7 @@ void kvm_x86_vendor_exit(void)
irq_work_sync(&pvclock_irq_work);
cancel_work_sync(&pvclock_gtod_work);
#endif
- static_call(kvm_x86_hardware_unsetup)();
+ kvm_x86_call(hardware_unsetup)();
kvm_mmu_vendor_module_exit();
free_percpu(user_return_msrs);
kmem_cache_destroy(x86_emulator_cache);
@@ -9859,56 +9849,11 @@ void kvm_x86_vendor_exit(void)
WARN_ON(static_branch_unlikely(&kvm_xen_enabled.key));
#endif
mutex_lock(&vendor_module_lock);
- kvm_x86_ops.hardware_enable = NULL;
+ kvm_x86_ops.enable_virtualization_cpu = NULL;
mutex_unlock(&vendor_module_lock);
}
EXPORT_SYMBOL_GPL(kvm_x86_vendor_exit);
-static int __kvm_emulate_halt(struct kvm_vcpu *vcpu, int state, int reason)
-{
- /*
- * The vCPU has halted, e.g. executed HLT. Update the run state if the
- * local APIC is in-kernel, the run loop will detect the non-runnable
- * state and halt the vCPU. Exit to userspace if the local APIC is
- * managed by userspace, in which case userspace is responsible for
- * handling wake events.
- */
- ++vcpu->stat.halt_exits;
- if (lapic_in_kernel(vcpu)) {
- vcpu->arch.mp_state = state;
- return 1;
- } else {
- vcpu->run->exit_reason = reason;
- return 0;
- }
-}
-
-int kvm_emulate_halt_noskip(struct kvm_vcpu *vcpu)
-{
- return __kvm_emulate_halt(vcpu, KVM_MP_STATE_HALTED, KVM_EXIT_HLT);
-}
-EXPORT_SYMBOL_GPL(kvm_emulate_halt_noskip);
-
-int kvm_emulate_halt(struct kvm_vcpu *vcpu)
-{
- int ret = kvm_skip_emulated_instruction(vcpu);
- /*
- * TODO: we might be squashing a GUESTDBG_SINGLESTEP-triggered
- * KVM_EXIT_DEBUG here.
- */
- return kvm_emulate_halt_noskip(vcpu) && ret;
-}
-EXPORT_SYMBOL_GPL(kvm_emulate_halt);
-
-int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu)
-{
- int ret = kvm_skip_emulated_instruction(vcpu);
-
- return __kvm_emulate_halt(vcpu, KVM_MP_STATE_AP_RESET_HOLD,
- KVM_EXIT_AP_RESET_HOLD) && ret;
-}
-EXPORT_SYMBOL_GPL(kvm_emulate_ap_reset_hold);
-
#ifdef CONFIG_X86_64
static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr,
unsigned long clock_type)
@@ -9976,7 +9921,8 @@ EXPORT_SYMBOL_GPL(kvm_apicv_activated);
bool kvm_vcpu_apicv_activated(struct kvm_vcpu *vcpu)
{
ulong vm_reasons = READ_ONCE(vcpu->kvm->arch.apicv_inhibit_reasons);
- ulong vcpu_reasons = static_call(kvm_x86_vcpu_get_apicv_inhibit_reasons)(vcpu);
+ ulong vcpu_reasons =
+ kvm_x86_call(vcpu_get_apicv_inhibit_reasons)(vcpu);
return (vm_reasons | vcpu_reasons) == 0;
}
@@ -9985,6 +9931,10 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_apicv_activated);
static void set_or_clear_apicv_inhibit(unsigned long *inhibits,
enum kvm_apicv_inhibit reason, bool set)
{
+ const struct trace_print_flags apicv_inhibits[] = { APICV_INHIBIT_REASONS };
+
+ BUILD_BUG_ON(ARRAY_SIZE(apicv_inhibits) != NR_APICV_INHIBIT_REASONS);
+
if (set)
__set_bit(reason, inhibits);
else
@@ -9995,15 +9945,12 @@ static void set_or_clear_apicv_inhibit(unsigned long *inhibits,
static void kvm_apicv_init(struct kvm *kvm)
{
- unsigned long *inhibits = &kvm->arch.apicv_inhibit_reasons;
-
- init_rwsem(&kvm->arch.apicv_update_lock);
+ enum kvm_apicv_inhibit reason = enable_apicv ? APICV_INHIBIT_REASON_ABSENT :
+ APICV_INHIBIT_REASON_DISABLED;
- set_or_clear_apicv_inhibit(inhibits, APICV_INHIBIT_REASON_ABSENT, true);
+ set_or_clear_apicv_inhibit(&kvm->arch.apicv_inhibit_reasons, reason, true);
- if (!enable_apicv)
- set_or_clear_apicv_inhibit(inhibits,
- APICV_INHIBIT_REASON_DISABLE, true);
+ init_rwsem(&kvm->arch.apicv_update_lock);
}
static void kvm_sched_yield(struct kvm_vcpu *vcpu, unsigned long dest_id)
@@ -10051,26 +9998,15 @@ static int complete_hypercall_exit(struct kvm_vcpu *vcpu)
return kvm_skip_emulated_instruction(vcpu);
}
-int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
+unsigned long __kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr,
+ unsigned long a0, unsigned long a1,
+ unsigned long a2, unsigned long a3,
+ int op_64_bit, int cpl)
{
- unsigned long nr, a0, a1, a2, a3, ret;
- int op_64_bit;
-
- if (kvm_xen_hypercall_enabled(vcpu->kvm))
- return kvm_xen_hypercall(vcpu);
-
- if (kvm_hv_hypercall_enabled(vcpu))
- return kvm_hv_hypercall(vcpu);
-
- nr = kvm_rax_read(vcpu);
- a0 = kvm_rbx_read(vcpu);
- a1 = kvm_rcx_read(vcpu);
- a2 = kvm_rdx_read(vcpu);
- a3 = kvm_rsi_read(vcpu);
+ unsigned long ret;
trace_kvm_hypercall(nr, a0, a1, a2, a3);
- op_64_bit = is_64_bit_hypercall(vcpu);
if (!op_64_bit) {
nr &= 0xFFFFFFFF;
a0 &= 0xFFFFFFFF;
@@ -10079,7 +10015,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
a3 &= 0xFFFFFFFF;
}
- if (static_call(kvm_x86_get_cpl)(vcpu) != 0) {
+ if (cpl) {
ret = -KVM_EPERM;
goto out;
}
@@ -10140,18 +10076,49 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
WARN_ON_ONCE(vcpu->run->hypercall.flags & KVM_EXIT_HYPERCALL_MBZ);
vcpu->arch.complete_userspace_io = complete_hypercall_exit;
+ /* stat is incremented on completion. */
return 0;
}
default:
ret = -KVM_ENOSYS;
break;
}
+
out:
+ ++vcpu->stat.hypercalls;
+ return ret;
+}
+EXPORT_SYMBOL_GPL(__kvm_emulate_hypercall);
+
+int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
+{
+ unsigned long nr, a0, a1, a2, a3, ret;
+ int op_64_bit;
+ int cpl;
+
+ if (kvm_xen_hypercall_enabled(vcpu->kvm))
+ return kvm_xen_hypercall(vcpu);
+
+ if (kvm_hv_hypercall_enabled(vcpu))
+ return kvm_hv_hypercall(vcpu);
+
+ nr = kvm_rax_read(vcpu);
+ a0 = kvm_rbx_read(vcpu);
+ a1 = kvm_rcx_read(vcpu);
+ a2 = kvm_rdx_read(vcpu);
+ a3 = kvm_rsi_read(vcpu);
+ op_64_bit = is_64_bit_hypercall(vcpu);
+ cpl = kvm_x86_call(get_cpl)(vcpu);
+
+ ret = __kvm_emulate_hypercall(vcpu, nr, a0, a1, a2, a3, op_64_bit, cpl);
+ if (nr == KVM_HC_MAP_GPA_RANGE && !ret)
+ /* MAP_GPA tosses the request to the user space. */
+ return 0;
+
if (!op_64_bit)
ret = (u32)ret;
kvm_rax_write(vcpu, ret);
- ++vcpu->stat.hypercalls;
return kvm_skip_emulated_instruction(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
@@ -10173,7 +10140,7 @@ static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
return X86EMUL_PROPAGATE_FAULT;
}
- static_call(kvm_x86_patch_hypercall)(vcpu, instruction);
+ kvm_x86_call(patch_hypercall)(vcpu, instruction);
return emulator_write_emulated(ctxt, rip, instruction, 3,
&ctxt->exception);
@@ -10190,7 +10157,7 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu)
{
struct kvm_run *kvm_run = vcpu->run;
- kvm_run->if_flag = static_call(kvm_x86_get_if_flag)(vcpu);
+ kvm_run->if_flag = kvm_x86_call(get_if_flag)(vcpu);
kvm_run->cr8 = kvm_get_cr8(vcpu);
kvm_run->apic_base = kvm_get_apic_base(vcpu);
@@ -10200,6 +10167,8 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu)
if (is_smm(vcpu))
kvm_run->flags |= KVM_RUN_X86_SMM;
+ if (is_guest_mode(vcpu))
+ kvm_run->flags |= KVM_RUN_X86_GUEST_MODE;
}
static void update_cr8_intercept(struct kvm_vcpu *vcpu)
@@ -10225,7 +10194,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
tpr = kvm_lapic_get_cr8(vcpu);
- static_call(kvm_x86_update_cr8_intercept)(vcpu, tpr, max_irr);
+ kvm_x86_call(update_cr8_intercept)(vcpu, tpr, max_irr);
}
@@ -10255,7 +10224,7 @@ static void kvm_inject_exception(struct kvm_vcpu *vcpu)
vcpu->arch.exception.error_code,
vcpu->arch.exception.injected);
- static_call(kvm_x86_inject_exception)(vcpu);
+ kvm_x86_call(inject_exception)(vcpu);
}
/*
@@ -10341,9 +10310,9 @@ static int kvm_check_and_inject_events(struct kvm_vcpu *vcpu,
else if (kvm_is_exception_pending(vcpu))
; /* see above */
else if (vcpu->arch.nmi_injected)
- static_call(kvm_x86_inject_nmi)(vcpu);
+ kvm_x86_call(inject_nmi)(vcpu);
else if (vcpu->arch.interrupt.injected)
- static_call(kvm_x86_inject_irq)(vcpu, true);
+ kvm_x86_call(inject_irq)(vcpu, true);
/*
* Exceptions that morph to VM-Exits are handled above, and pending
@@ -10428,7 +10397,8 @@ static int kvm_check_and_inject_events(struct kvm_vcpu *vcpu,
*/
#ifdef CONFIG_KVM_SMM
if (vcpu->arch.smi_pending) {
- r = can_inject ? static_call(kvm_x86_smi_allowed)(vcpu, true) : -EBUSY;
+ r = can_inject ? kvm_x86_call(smi_allowed)(vcpu, true) :
+ -EBUSY;
if (r < 0)
goto out;
if (r) {
@@ -10437,27 +10407,29 @@ static int kvm_check_and_inject_events(struct kvm_vcpu *vcpu,
enter_smm(vcpu);
can_inject = false;
} else
- static_call(kvm_x86_enable_smi_window)(vcpu);
+ kvm_x86_call(enable_smi_window)(vcpu);
}
#endif
if (vcpu->arch.nmi_pending) {
- r = can_inject ? static_call(kvm_x86_nmi_allowed)(vcpu, true) : -EBUSY;
+ r = can_inject ? kvm_x86_call(nmi_allowed)(vcpu, true) :
+ -EBUSY;
if (r < 0)
goto out;
if (r) {
--vcpu->arch.nmi_pending;
vcpu->arch.nmi_injected = true;
- static_call(kvm_x86_inject_nmi)(vcpu);
+ kvm_x86_call(inject_nmi)(vcpu);
can_inject = false;
- WARN_ON(static_call(kvm_x86_nmi_allowed)(vcpu, true) < 0);
+ WARN_ON(kvm_x86_call(nmi_allowed)(vcpu, true) < 0);
}
if (vcpu->arch.nmi_pending)
- static_call(kvm_x86_enable_nmi_window)(vcpu);
+ kvm_x86_call(enable_nmi_window)(vcpu);
}
if (kvm_cpu_has_injectable_intr(vcpu)) {
- r = can_inject ? static_call(kvm_x86_interrupt_allowed)(vcpu, true) : -EBUSY;
+ r = can_inject ? kvm_x86_call(interrupt_allowed)(vcpu, true) :
+ -EBUSY;
if (r < 0)
goto out;
if (r) {
@@ -10465,17 +10437,17 @@ static int kvm_check_and_inject_events(struct kvm_vcpu *vcpu,
if (!WARN_ON_ONCE(irq == -1)) {
kvm_queue_interrupt(vcpu, irq, false);
- static_call(kvm_x86_inject_irq)(vcpu, false);
- WARN_ON(static_call(kvm_x86_interrupt_allowed)(vcpu, true) < 0);
+ kvm_x86_call(inject_irq)(vcpu, false);
+ WARN_ON(kvm_x86_call(interrupt_allowed)(vcpu, true) < 0);
}
}
if (kvm_cpu_has_injectable_intr(vcpu))
- static_call(kvm_x86_enable_irq_window)(vcpu);
+ kvm_x86_call(enable_irq_window)(vcpu);
}
if (is_guest_mode(vcpu) &&
kvm_x86_ops.nested_ops->has_events &&
- kvm_x86_ops.nested_ops->has_events(vcpu))
+ kvm_x86_ops.nested_ops->has_events(vcpu, true))
*req_immediate_exit = true;
/*
@@ -10516,7 +10488,7 @@ static void process_nmi(struct kvm_vcpu *vcpu)
* blocks NMIs). KVM will immediately inject one of the two NMIs, and
* will request an NMI window to handle the second NMI.
*/
- if (static_call(kvm_x86_get_nmi_mask)(vcpu) || vcpu->arch.nmi_injected)
+ if (kvm_x86_call(get_nmi_mask)(vcpu) || vcpu->arch.nmi_injected)
limit = 1;
else
limit = 2;
@@ -10525,14 +10497,14 @@ static void process_nmi(struct kvm_vcpu *vcpu)
* Adjust the limit to account for pending virtual NMIs, which aren't
* tracked in vcpu->arch.nmi_pending.
*/
- if (static_call(kvm_x86_is_vnmi_pending)(vcpu))
+ if (kvm_x86_call(is_vnmi_pending)(vcpu))
limit--;
vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0);
vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit);
if (vcpu->arch.nmi_pending &&
- (static_call(kvm_x86_set_vnmi_pending)(vcpu)))
+ (kvm_x86_call(set_vnmi_pending)(vcpu)))
vcpu->arch.nmi_pending--;
if (vcpu->arch.nmi_pending)
@@ -10543,7 +10515,7 @@ static void process_nmi(struct kvm_vcpu *vcpu)
int kvm_get_nr_pending_nmis(struct kvm_vcpu *vcpu)
{
return vcpu->arch.nmi_pending +
- static_call(kvm_x86_is_vnmi_pending)(vcpu);
+ kvm_x86_call(is_vnmi_pending)(vcpu);
}
void kvm_make_scan_ioapic_request_mask(struct kvm *kvm,
@@ -10577,7 +10549,7 @@ void __kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
apic->apicv_active = activate;
kvm_apic_update_apicv(vcpu);
- static_call(kvm_x86_refresh_apicv_exec_ctrl)(vcpu);
+ kvm_x86_call(refresh_apicv_exec_ctrl)(vcpu);
/*
* When APICv gets disabled, we may still have injected interrupts
@@ -10677,13 +10649,12 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
bitmap_zero(vcpu->arch.ioapic_handled_vectors, 256);
+ kvm_x86_call(sync_pir_to_irr)(vcpu);
+
if (irqchip_split(vcpu->kvm))
kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors);
- else {
- static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
- if (ioapic_in_kernel(vcpu->kvm))
- kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
- }
+ else if (ioapic_in_kernel(vcpu->kvm))
+ kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
if (is_guest_mode(vcpu))
vcpu->arch.load_eoi_exitmap_pending = true;
@@ -10703,17 +10674,17 @@ static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu)
bitmap_or((ulong *)eoi_exit_bitmap,
vcpu->arch.ioapic_handled_vectors,
to_hv_synic(vcpu)->vec_bitmap, 256);
- static_call_cond(kvm_x86_load_eoi_exitmap)(vcpu, eoi_exit_bitmap);
+ kvm_x86_call(load_eoi_exitmap)(vcpu, eoi_exit_bitmap);
return;
}
#endif
- static_call_cond(kvm_x86_load_eoi_exitmap)(
+ kvm_x86_call(load_eoi_exitmap)(
vcpu, (u64 *)vcpu->arch.ioapic_handled_vectors);
}
void kvm_arch_guest_memory_reclaimed(struct kvm *kvm)
{
- static_call_cond(kvm_x86_guest_memory_reclaimed)(kvm);
+ kvm_x86_call(guest_memory_reclaimed)(kvm);
}
static void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
@@ -10721,7 +10692,7 @@ static void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
if (!lapic_in_kernel(vcpu))
return;
- static_call_cond(kvm_x86_set_apic_access_page_addr)(vcpu);
+ kvm_x86_call(set_apic_access_page_addr)(vcpu);
}
/*
@@ -10885,10 +10856,18 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (kvm_check_request(KVM_REQ_APF_READY, vcpu))
kvm_check_async_pf_completion(vcpu);
if (kvm_check_request(KVM_REQ_MSR_FILTER_CHANGED, vcpu))
- static_call(kvm_x86_msr_filter_changed)(vcpu);
+ kvm_x86_call(msr_filter_changed)(vcpu);
if (kvm_check_request(KVM_REQ_UPDATE_CPU_DIRTY_LOGGING, vcpu))
- static_call(kvm_x86_update_cpu_dirty_logging)(vcpu);
+ kvm_x86_call(update_cpu_dirty_logging)(vcpu);
+
+ if (kvm_check_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, vcpu)) {
+ kvm_vcpu_reset(vcpu, true);
+ if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE) {
+ r = 1;
+ goto out;
+ }
+ }
}
if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win ||
@@ -10910,7 +10889,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
goto out;
}
if (req_int_win)
- static_call(kvm_x86_enable_irq_window)(vcpu);
+ kvm_x86_call(enable_irq_window)(vcpu);
if (kvm_lapic_enabled(vcpu)) {
update_cr8_intercept(vcpu);
@@ -10925,7 +10904,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
preempt_disable();
- static_call(kvm_x86_prepare_switch_to_guest)(vcpu);
+ kvm_x86_call(prepare_switch_to_guest)(vcpu);
/*
* Disable IRQs before setting IN_GUEST_MODE. Posted interrupt
@@ -10961,7 +10940,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
* i.e. they can post interrupts even if APICv is temporarily disabled.
*/
if (kvm_lapic_enabled(vcpu))
- static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
+ kvm_x86_call(sync_pir_to_irr)(vcpu);
if (kvm_vcpu_exit_request(vcpu)) {
vcpu->mode = OUTSIDE_GUEST_MODE;
@@ -11005,12 +10984,13 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
WARN_ON_ONCE((kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu)) &&
(kvm_get_apic_mode(vcpu) != LAPIC_MODE_DISABLED));
- exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu, req_immediate_exit);
+ exit_fastpath = kvm_x86_call(vcpu_run)(vcpu,
+ req_immediate_exit);
if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST))
break;
if (kvm_lapic_enabled(vcpu))
- static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
+ kvm_x86_call(sync_pir_to_irr)(vcpu);
if (unlikely(kvm_vcpu_exit_request(vcpu))) {
exit_fastpath = EXIT_FASTPATH_EXIT_HANDLED;
@@ -11029,7 +11009,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
*/
if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) {
WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP);
- static_call(kvm_x86_sync_dirty_debug_regs)(vcpu);
+ kvm_x86_call(sync_dirty_debug_regs)(vcpu);
kvm_update_dr0123(vcpu);
kvm_update_dr7(vcpu);
}
@@ -11058,7 +11038,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (vcpu->arch.xfd_no_write_intercept)
fpu_sync_guest_vmexit_xfd_state();
- static_call(kvm_x86_handle_exit_irqoff)(vcpu);
+ kvm_x86_call(handle_exit_irqoff)(vcpu);
if (vcpu->arch.guest_fpu.xfd_err)
wrmsrl(MSR_IA32_XFD_ERR, 0);
@@ -11091,6 +11071,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
kvm_vcpu_srcu_read_lock(vcpu);
/*
+ * Call this to ensure WC buffers in guest are evicted after each VM
+ * Exit, so that the evicted WC writes can be snooped across all cpus
+ */
+ smp_mb__after_srcu_read_lock();
+
+ /*
* Profile KVM exit RIPs:
*/
if (unlikely(prof_on == KVM_PROFILING)) {
@@ -11104,19 +11090,83 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (vcpu->arch.apic_attention)
kvm_lapic_sync_from_vapic(vcpu);
- r = static_call(kvm_x86_handle_exit)(vcpu, exit_fastpath);
+ if (unlikely(exit_fastpath == EXIT_FASTPATH_EXIT_USERSPACE))
+ return 0;
+
+ r = kvm_x86_call(handle_exit)(vcpu, exit_fastpath);
return r;
cancel_injection:
if (req_immediate_exit)
kvm_make_request(KVM_REQ_EVENT, vcpu);
- static_call(kvm_x86_cancel_injection)(vcpu);
+ kvm_x86_call(cancel_injection)(vcpu);
if (unlikely(vcpu->arch.apic_attention))
kvm_lapic_sync_from_vapic(vcpu);
out:
return r;
}
+static bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
+{
+ return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
+ !vcpu->arch.apf.halted);
+}
+
+static bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
+{
+ if (!list_empty_careful(&vcpu->async_pf.done))
+ return true;
+
+ if (kvm_apic_has_pending_init_or_sipi(vcpu) &&
+ kvm_apic_init_sipi_allowed(vcpu))
+ return true;
+
+ if (vcpu->arch.pv.pv_unhalted)
+ return true;
+
+ if (kvm_is_exception_pending(vcpu))
+ return true;
+
+ if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
+ (vcpu->arch.nmi_pending &&
+ kvm_x86_call(nmi_allowed)(vcpu, false)))
+ return true;
+
+#ifdef CONFIG_KVM_SMM
+ if (kvm_test_request(KVM_REQ_SMI, vcpu) ||
+ (vcpu->arch.smi_pending &&
+ kvm_x86_call(smi_allowed)(vcpu, false)))
+ return true;
+#endif
+
+ if (kvm_test_request(KVM_REQ_PMI, vcpu))
+ return true;
+
+ if (kvm_test_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, vcpu))
+ return true;
+
+ if (kvm_arch_interrupt_allowed(vcpu) && kvm_cpu_has_interrupt(vcpu))
+ return true;
+
+ if (kvm_hv_has_stimer_pending(vcpu))
+ return true;
+
+ if (is_guest_mode(vcpu) &&
+ kvm_x86_ops.nested_ops->has_events &&
+ kvm_x86_ops.nested_ops->has_events(vcpu, false))
+ return true;
+
+ if (kvm_xen_has_pending_events(vcpu))
+ return true;
+
+ return false;
+}
+
+int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
+{
+ return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu);
+}
+
/* Called within kvm->srcu read side. */
static inline int vcpu_block(struct kvm_vcpu *vcpu)
{
@@ -11160,7 +11210,10 @@ static inline int vcpu_block(struct kvm_vcpu *vcpu)
* causes a spurious wakeup from HLT).
*/
if (is_guest_mode(vcpu)) {
- if (kvm_check_nested_events(vcpu) < 0)
+ int r = kvm_check_nested_events(vcpu);
+
+ WARN_ON_ONCE(r == -EBUSY);
+ if (r < 0)
return 0;
}
@@ -11185,19 +11238,12 @@ static inline int vcpu_block(struct kvm_vcpu *vcpu)
return 1;
}
-static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
-{
- return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
- !vcpu->arch.apf.halted);
-}
-
/* Called within kvm->srcu read side. */
static int vcpu_run(struct kvm_vcpu *vcpu)
{
int r;
vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
- vcpu->arch.l1tf_flush_l1d = true;
for (;;) {
/*
@@ -11243,6 +11289,98 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
return r;
}
+static int __kvm_emulate_halt(struct kvm_vcpu *vcpu, int state, int reason)
+{
+ /*
+ * The vCPU has halted, e.g. executed HLT. Update the run state if the
+ * local APIC is in-kernel, the run loop will detect the non-runnable
+ * state and halt the vCPU. Exit to userspace if the local APIC is
+ * managed by userspace, in which case userspace is responsible for
+ * handling wake events.
+ */
+ ++vcpu->stat.halt_exits;
+ if (lapic_in_kernel(vcpu)) {
+ if (kvm_vcpu_has_events(vcpu))
+ vcpu->arch.pv.pv_unhalted = false;
+ else
+ vcpu->arch.mp_state = state;
+ return 1;
+ } else {
+ vcpu->run->exit_reason = reason;
+ return 0;
+ }
+}
+
+int kvm_emulate_halt_noskip(struct kvm_vcpu *vcpu)
+{
+ return __kvm_emulate_halt(vcpu, KVM_MP_STATE_HALTED, KVM_EXIT_HLT);
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_halt_noskip);
+
+int kvm_emulate_halt(struct kvm_vcpu *vcpu)
+{
+ int ret = kvm_skip_emulated_instruction(vcpu);
+ /*
+ * TODO: we might be squashing a GUESTDBG_SINGLESTEP-triggered
+ * KVM_EXIT_DEBUG here.
+ */
+ return kvm_emulate_halt_noskip(vcpu) && ret;
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_halt);
+
+fastpath_t handle_fastpath_hlt(struct kvm_vcpu *vcpu)
+{
+ int ret;
+
+ kvm_vcpu_srcu_read_lock(vcpu);
+ ret = kvm_emulate_halt(vcpu);
+ kvm_vcpu_srcu_read_unlock(vcpu);
+
+ if (!ret)
+ return EXIT_FASTPATH_EXIT_USERSPACE;
+
+ if (kvm_vcpu_running(vcpu))
+ return EXIT_FASTPATH_REENTER_GUEST;
+
+ return EXIT_FASTPATH_EXIT_HANDLED;
+}
+EXPORT_SYMBOL_GPL(handle_fastpath_hlt);
+
+int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu)
+{
+ int ret = kvm_skip_emulated_instruction(vcpu);
+
+ return __kvm_emulate_halt(vcpu, KVM_MP_STATE_AP_RESET_HOLD,
+ KVM_EXIT_AP_RESET_HOLD) && ret;
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_ap_reset_hold);
+
+bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
+{
+ return kvm_vcpu_apicv_active(vcpu) &&
+ kvm_x86_call(dy_apicv_has_pending_interrupt)(vcpu);
+}
+
+bool kvm_arch_vcpu_preempted_in_kernel(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.preempted_in_kernel;
+}
+
+bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
+{
+ if (READ_ONCE(vcpu->arch.pv.pv_unhalted))
+ return true;
+
+ if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
+#ifdef CONFIG_KVM_SMM
+ kvm_test_request(KVM_REQ_SMI, vcpu) ||
+#endif
+ kvm_test_request(KVM_REQ_EVENT, vcpu))
+ return true;
+
+ return kvm_arch_dy_has_pending_interrupt(vcpu);
+}
+
static inline int complete_emulated_io(struct kvm_vcpu *vcpu)
{
return kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
@@ -11347,7 +11485,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
kvm_vcpu_srcu_read_lock(vcpu);
if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
- if (kvm_run->immediate_exit) {
+ if (!vcpu->wants_to_run) {
r = -EINTR;
goto out;
}
@@ -11425,12 +11563,12 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
WARN_ON_ONCE(vcpu->mmio_needed);
}
- if (kvm_run->immediate_exit) {
+ if (!vcpu->wants_to_run) {
r = -EINTR;
goto out;
}
- r = static_call(kvm_x86_vcpu_pre_run)(vcpu);
+ r = kvm_x86_call(vcpu_pre_run)(vcpu);
if (r <= 0)
goto out;
@@ -11486,6 +11624,10 @@ static void __get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
{
+ if (vcpu->kvm->arch.has_protected_state &&
+ vcpu->arch.guest_state_protected)
+ return -EINVAL;
+
vcpu_load(vcpu);
__get_regs(vcpu, regs);
vcpu_put(vcpu);
@@ -11527,6 +11669,10 @@ static void __set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
{
+ if (vcpu->kvm->arch.has_protected_state &&
+ vcpu->arch.guest_state_protected)
+ return -EINVAL;
+
vcpu_load(vcpu);
__set_regs(vcpu, regs);
vcpu_put(vcpu);
@@ -11550,10 +11696,10 @@ static void __get_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
- static_call(kvm_x86_get_idt)(vcpu, &dt);
+ kvm_x86_call(get_idt)(vcpu, &dt);
sregs->idt.limit = dt.size;
sregs->idt.base = dt.address;
- static_call(kvm_x86_get_gdt)(vcpu, &dt);
+ kvm_x86_call(get_gdt)(vcpu, &dt);
sregs->gdt.limit = dt.size;
sregs->gdt.base = dt.address;
@@ -11599,6 +11745,10 @@ static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2)
int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
struct kvm_sregs *sregs)
{
+ if (vcpu->kvm->arch.has_protected_state &&
+ vcpu->arch.guest_state_protected)
+ return -EINVAL;
+
vcpu_load(vcpu);
__get_sregs(vcpu, sregs);
vcpu_put(vcpu);
@@ -11691,7 +11841,13 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
ret = emulator_task_switch(ctxt, tss_selector, idt_index, reason,
has_error_code, error_code);
- if (ret) {
+
+ /*
+ * Report an error userspace if MMIO is needed, as KVM doesn't support
+ * MMIO during a task switch (or any other complex operation).
+ */
+ if (ret || vcpu->mmio_needed) {
+ vcpu->mmio_needed = false;
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
vcpu->run->internal.ndata = 0;
@@ -11749,27 +11905,27 @@ static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs,
dt.size = sregs->idt.limit;
dt.address = sregs->idt.base;
- static_call(kvm_x86_set_idt)(vcpu, &dt);
+ kvm_x86_call(set_idt)(vcpu, &dt);
dt.size = sregs->gdt.limit;
dt.address = sregs->gdt.base;
- static_call(kvm_x86_set_gdt)(vcpu, &dt);
+ kvm_x86_call(set_gdt)(vcpu, &dt);
vcpu->arch.cr2 = sregs->cr2;
*mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
vcpu->arch.cr3 = sregs->cr3;
kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
- static_call_cond(kvm_x86_post_set_cr3)(vcpu, sregs->cr3);
+ kvm_x86_call(post_set_cr3)(vcpu, sregs->cr3);
kvm_set_cr8(vcpu, sregs->cr8);
*mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
- static_call(kvm_x86_set_efer)(vcpu, sregs->efer);
+ kvm_x86_call(set_efer)(vcpu, sregs->efer);
*mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
- static_call(kvm_x86_set_cr0)(vcpu, sregs->cr0);
+ kvm_x86_call(set_cr0)(vcpu, sregs->cr0);
*mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
- static_call(kvm_x86_set_cr4)(vcpu, sregs->cr4);
+ kvm_x86_call(set_cr4)(vcpu, sregs->cr4);
if (update_pdptrs) {
idx = srcu_read_lock(&vcpu->kvm->srcu);
@@ -11866,6 +12022,10 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
{
int ret;
+ if (vcpu->kvm->arch.has_protected_state &&
+ vcpu->arch.guest_state_protected)
+ return -EINVAL;
+
vcpu_load(vcpu);
ret = __set_sregs(vcpu, sregs);
vcpu_put(vcpu);
@@ -11943,7 +12103,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
*/
kvm_set_rflags(vcpu, rflags);
- static_call(kvm_x86_update_exception_bitmap)(vcpu);
+ kvm_x86_call(update_exception_bitmap)(vcpu);
kvm_arch_vcpu_guestdbg_update_apicv_inhibit(vcpu->kvm);
@@ -11983,7 +12143,7 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
struct fxregs_state *fxsave;
if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
- return 0;
+ return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;
vcpu_load(vcpu);
@@ -12006,7 +12166,7 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
struct fxregs_state *fxsave;
if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
- return 0;
+ return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;
vcpu_load(vcpu);
@@ -12080,7 +12240,7 @@ int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id)
if (id >= kvm->arch.max_vcpu_ids)
return -EINVAL;
- return static_call(kvm_x86_vcpu_precreate)(kvm);
+ return kvm_x86_call(vcpu_precreate)(kvm);
}
int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
@@ -12103,7 +12263,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
if (r < 0)
return r;
- r = kvm_create_lapic(vcpu, lapic_timer_advance_ns);
+ r = kvm_create_lapic(vcpu);
if (r < 0)
goto fail_mmu_destroy;
@@ -12137,8 +12297,6 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
vcpu->arch.reserved_gpa_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu);
- vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT;
-
kvm_async_pf_hash_reset(vcpu);
vcpu->arch.perf_capabilities = kvm_caps.supported_perf_cap;
@@ -12151,14 +12309,13 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
vcpu->arch.hv_root_tdp = INVALID_PAGE;
#endif
- r = static_call(kvm_x86_vcpu_create)(vcpu);
+ r = kvm_x86_call(vcpu_create)(vcpu);
if (r)
goto free_guest_fpu;
vcpu->arch.arch_capabilities = kvm_get_arch_capabilities();
vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
kvm_xen_init_vcpu(vcpu);
- kvm_vcpu_mtrr_init(vcpu);
vcpu_load(vcpu);
kvm_set_tsc_khz(vcpu, vcpu->kvm->arch.default_tsc_khz);
kvm_vcpu_reset(vcpu, false);
@@ -12209,7 +12366,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
kvmclock_reset(vcpu);
- static_call(kvm_x86_vcpu_free)(vcpu);
+ kvm_x86_call(vcpu_free)(vcpu);
kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
@@ -12305,6 +12462,8 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
if (!init_event) {
vcpu->arch.smbase = 0x30000;
+ vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT;
+
vcpu->arch.msr_misc_features_enables = 0;
vcpu->arch.ia32_misc_enable_msr = MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL |
MSR_IA32_MISC_ENABLE_BTS_UNAVAIL;
@@ -12327,7 +12486,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
cpuid_0x1 = kvm_find_cpuid_entry(vcpu, 1);
kvm_rdx_write(vcpu, cpuid_0x1 ? cpuid_0x1->eax : 0x600);
- static_call(kvm_x86_vcpu_reset)(vcpu, init_event);
+ kvm_x86_call(vcpu_reset)(vcpu, init_event);
kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
kvm_rip_write(vcpu, 0xfff0);
@@ -12346,10 +12505,10 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
else
new_cr0 |= X86_CR0_NW | X86_CR0_CD;
- static_call(kvm_x86_set_cr0)(vcpu, new_cr0);
- static_call(kvm_x86_set_cr4)(vcpu, 0);
- static_call(kvm_x86_set_efer)(vcpu, 0);
- static_call(kvm_x86_update_exception_bitmap)(vcpu);
+ kvm_x86_call(set_cr0)(vcpu, new_cr0);
+ kvm_x86_call(set_cr4)(vcpu, 0);
+ kvm_x86_call(set_efer)(vcpu, 0);
+ kvm_x86_call(update_exception_bitmap)(vcpu);
/*
* On the standard CR0/CR4/EFER modification paths, there are several
@@ -12390,7 +12549,17 @@ void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
}
EXPORT_SYMBOL_GPL(kvm_vcpu_deliver_sipi_vector);
-int kvm_arch_hardware_enable(void)
+void kvm_arch_enable_virtualization(void)
+{
+ cpu_emergency_register_virt_callback(kvm_x86_ops.emergency_disable_virtualization_cpu);
+}
+
+void kvm_arch_disable_virtualization(void)
+{
+ cpu_emergency_unregister_virt_callback(kvm_x86_ops.emergency_disable_virtualization_cpu);
+}
+
+int kvm_arch_enable_virtualization_cpu(void)
{
struct kvm *kvm;
struct kvm_vcpu *vcpu;
@@ -12406,7 +12575,7 @@ int kvm_arch_hardware_enable(void)
if (ret)
return ret;
- ret = static_call(kvm_x86_hardware_enable)();
+ ret = kvm_x86_call(enable_virtualization_cpu)();
if (ret != 0)
return ret;
@@ -12486,9 +12655,9 @@ int kvm_arch_hardware_enable(void)
return 0;
}
-void kvm_arch_hardware_disable(void)
+void kvm_arch_disable_virtualization_cpu(void)
{
- static_call(kvm_x86_hardware_disable)();
+ kvm_x86_call(disable_virtualization_cpu)();
drop_user_return_notifiers();
}
@@ -12502,18 +12671,6 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
return (vcpu->arch.apic_base & MSR_IA32_APICBASE_BSP) != 0;
}
-void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
-{
- struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
-
- vcpu->arch.l1tf_flush_l1d = true;
- if (pmu->version && unlikely(pmu->event_count)) {
- pmu->need_cleanup = true;
- kvm_make_request(KVM_REQ_PMU, vcpu);
- }
- static_call(kvm_x86_sched_in)(vcpu, cpu);
-}
-
void kvm_arch_free_vm(struct kvm *kvm)
{
#if IS_ENABLED(CONFIG_HYPERV)
@@ -12532,6 +12689,11 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
return -EINVAL;
kvm->arch.vm_type = type;
+ kvm->arch.has_private_mem =
+ (type == KVM_X86_SW_PROTECTED_VM);
+ /* Decided by the vendor code for other VM types. */
+ kvm->arch.pre_fault_allowed =
+ type == KVM_X86_DEFAULT_VM || type == KVM_X86_SW_PROTECTED_VM;
ret = kvm_page_track_init(kvm);
if (ret)
@@ -12539,7 +12701,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
kvm_mmu_init_vm(kvm);
- ret = static_call(kvm_x86_vm_init)(kvm);
+ ret = kvm_x86_call(vm_init)(kvm);
if (ret)
goto out_uninit_mmu;
@@ -12562,6 +12724,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
kvm->arch.default_tsc_khz = max_tsc_khz ? : tsc_khz;
+ kvm->arch.apic_bus_cycle_ns = APIC_BUS_CYCLE_NS_DEFAULT;
kvm->arch.guest_can_read_msr_platform_info = true;
kvm->arch.enable_pmu = enable_pmu;
@@ -12713,7 +12876,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
mutex_unlock(&kvm->slots_lock);
}
kvm_unload_vcpu_mmus(kvm);
- static_call_cond(kvm_x86_vm_destroy)(kvm);
+ kvm_x86_call(vm_destroy)(kvm);
kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1));
kvm_pic_destroy(kvm);
kvm_ioapic_destroy(kvm);
@@ -12731,7 +12894,7 @@ static void memslot_rmap_free(struct kvm_memory_slot *slot)
int i;
for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
- kvfree(slot->arch.rmap[i]);
+ vfree(slot->arch.rmap[i]);
slot->arch.rmap[i] = NULL;
}
}
@@ -12743,7 +12906,7 @@ void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
memslot_rmap_free(slot);
for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) {
- kvfree(slot->arch.lpage_info[i - 1]);
+ vfree(slot->arch.lpage_info[i - 1]);
slot->arch.lpage_info[i - 1] = NULL;
}
@@ -12835,7 +12998,7 @@ out_free:
memslot_rmap_free(slot);
for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) {
- kvfree(slot->arch.lpage_info[i - 1]);
+ vfree(slot->arch.lpage_info[i - 1]);
slot->arch.lpage_info[i - 1] = NULL;
}
return -ENOMEM;
@@ -13042,98 +13205,12 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
kvm_arch_free_memslot(kvm, old);
}
-static inline bool kvm_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
-{
- return (is_guest_mode(vcpu) &&
- static_call(kvm_x86_guest_apic_has_interrupt)(vcpu));
-}
-
-static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
-{
- if (!list_empty_careful(&vcpu->async_pf.done))
- return true;
-
- if (kvm_apic_has_pending_init_or_sipi(vcpu) &&
- kvm_apic_init_sipi_allowed(vcpu))
- return true;
-
- if (vcpu->arch.pv.pv_unhalted)
- return true;
-
- if (kvm_is_exception_pending(vcpu))
- return true;
-
- if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
- (vcpu->arch.nmi_pending &&
- static_call(kvm_x86_nmi_allowed)(vcpu, false)))
- return true;
-
-#ifdef CONFIG_KVM_SMM
- if (kvm_test_request(KVM_REQ_SMI, vcpu) ||
- (vcpu->arch.smi_pending &&
- static_call(kvm_x86_smi_allowed)(vcpu, false)))
- return true;
-#endif
-
- if (kvm_test_request(KVM_REQ_PMI, vcpu))
- return true;
-
- if (kvm_arch_interrupt_allowed(vcpu) &&
- (kvm_cpu_has_interrupt(vcpu) ||
- kvm_guest_apic_has_interrupt(vcpu)))
- return true;
-
- if (kvm_hv_has_stimer_pending(vcpu))
- return true;
-
- if (is_guest_mode(vcpu) &&
- kvm_x86_ops.nested_ops->has_events &&
- kvm_x86_ops.nested_ops->has_events(vcpu))
- return true;
-
- if (kvm_xen_has_pending_events(vcpu))
- return true;
-
- return false;
-}
-
-int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
-{
- return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu);
-}
-
-bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
-{
- return kvm_vcpu_apicv_active(vcpu) &&
- static_call(kvm_x86_dy_apicv_has_pending_interrupt)(vcpu);
-}
-
-bool kvm_arch_vcpu_preempted_in_kernel(struct kvm_vcpu *vcpu)
-{
- return vcpu->arch.preempted_in_kernel;
-}
-
-bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
-{
- if (READ_ONCE(vcpu->arch.pv.pv_unhalted))
- return true;
-
- if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
-#ifdef CONFIG_KVM_SMM
- kvm_test_request(KVM_REQ_SMI, vcpu) ||
-#endif
- kvm_test_request(KVM_REQ_EVENT, vcpu))
- return true;
-
- return kvm_arch_dy_has_pending_interrupt(vcpu);
-}
-
bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
{
if (vcpu->arch.guest_state_protected)
return true;
- return static_call(kvm_x86_get_cpl)(vcpu) == 0;
+ return kvm_x86_call(get_cpl)(vcpu) == 0;
}
unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu)
@@ -13148,7 +13225,7 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
{
- return static_call(kvm_x86_interrupt_allowed)(vcpu, false);
+ return kvm_x86_call(interrupt_allowed)(vcpu, false);
}
unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu)
@@ -13174,7 +13251,7 @@ unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
{
unsigned long rflags;
- rflags = static_call(kvm_x86_get_rflags)(vcpu);
+ rflags = kvm_x86_call(get_rflags)(vcpu);
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
rflags &= ~X86_EFLAGS_TF;
return rflags;
@@ -13186,7 +13263,7 @@ static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip))
rflags |= X86_EFLAGS_TF;
- static_call(kvm_x86_set_rflags)(vcpu, rflags);
+ kvm_x86_call(set_rflags)(vcpu, rflags);
}
void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
@@ -13298,7 +13375,7 @@ static bool kvm_can_deliver_async_pf(struct kvm_vcpu *vcpu)
return false;
if (vcpu->arch.apf.send_user_only &&
- static_call(kvm_x86_get_cpl)(vcpu) == 0)
+ kvm_x86_call(get_cpl)(vcpu) == 0)
return false;
if (is_guest_mode(vcpu)) {
@@ -13409,7 +13486,7 @@ bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu)
void kvm_arch_start_assignment(struct kvm *kvm)
{
if (atomic_inc_return(&kvm->arch.assigned_device_count) == 1)
- static_call_cond(kvm_x86_pi_start_assignment)(kvm);
+ kvm_x86_call(pi_start_assignment)(kvm);
}
EXPORT_SYMBOL_GPL(kvm_arch_start_assignment);
@@ -13428,13 +13505,13 @@ EXPORT_SYMBOL_GPL(kvm_arch_has_assigned_device);
static void kvm_noncoherent_dma_assignment_start_or_stop(struct kvm *kvm)
{
/*
- * Non-coherent DMA assignment and de-assignment will affect
- * whether KVM honors guest MTRRs and cause changes in memtypes
- * in TDP.
- * So, pass %true unconditionally to indicate non-coherent DMA was,
- * or will be involved, and that zapping SPTEs might be necessary.
+ * Non-coherent DMA assignment and de-assignment may affect whether or
+ * not KVM honors guest PAT, and thus may cause changes in EPT SPTEs
+ * due to toggling the "ignore PAT" bit. Zap all SPTEs when the first
+ * (or last) non-coherent device is (un)registered to so that new SPTEs
+ * with the correct "ignore guest PAT" setting are created.
*/
- if (__kvm_mmu_honors_guest_mtrrs(true))
+ if (kvm_mmu_may_ignore_guest_pat())
kvm_zap_gfn_range(kvm, gpa_to_gfn(0), gpa_to_gfn(~0ULL));
}
@@ -13472,9 +13549,8 @@ int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
irqfd->producer = prod;
kvm_arch_start_assignment(irqfd->kvm);
- ret = static_call(kvm_x86_pi_update_irte)(irqfd->kvm,
- prod->irq, irqfd->gsi, 1);
-
+ ret = kvm_x86_call(pi_update_irte)(irqfd->kvm,
+ prod->irq, irqfd->gsi, 1);
if (ret)
kvm_arch_end_assignment(irqfd->kvm);
@@ -13497,7 +13573,8 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
* when the irq is masked/disabled or the consumer side (KVM
* int this case doesn't want to receive the interrupts.
*/
- ret = static_call(kvm_x86_pi_update_irte)(irqfd->kvm, prod->irq, irqfd->gsi, 0);
+ ret = kvm_x86_call(pi_update_irte)(irqfd->kvm,
+ prod->irq, irqfd->gsi, 0);
if (ret)
printk(KERN_INFO "irq bypass consumer (token %p) unregistration"
" fails: %d\n", irqfd->consumer.token, ret);
@@ -13508,7 +13585,7 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
uint32_t guest_irq, bool set)
{
- return static_call(kvm_x86_pi_update_irte)(kvm, host_irq, guest_irq, set);
+ return kvm_x86_call(pi_update_irte)(kvm, host_irq, guest_irq, set);
}
bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old,
@@ -13531,6 +13608,19 @@ bool kvm_arch_no_poll(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_arch_no_poll);
+#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE
+int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_order)
+{
+ return kvm_x86_call(gmem_prepare)(kvm, pfn, gfn, max_order);
+}
+#endif
+
+#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
+void kvm_arch_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end)
+{
+ kvm_x86_call(gmem_invalidate)(start, end);
+}
+#endif
int kvm_spec_ctrl_test_value(u64 value)
{
@@ -13916,6 +14006,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_enter);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_exit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_enter);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_exit);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_rmp_fault);
static int __init kvm_x86_init(void)
{
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index a8b71803777b..a84c48ef5278 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -24,6 +24,8 @@ struct kvm_caps {
bool has_bus_lock_exit;
/* notify VM exit supported? */
bool has_notify_vmexit;
+ /* bit mask of VM types */
+ u32 supported_vm_types;
u64 supported_mce_cap;
u64 supported_xcr0;
@@ -31,6 +33,20 @@ struct kvm_caps {
u64 supported_perf_cap;
};
+struct kvm_host_values {
+ /*
+ * The host's raw MAXPHYADDR, i.e. the number of non-reserved physical
+ * address bits irrespective of features that repurpose legal bits,
+ * e.g. MKTME.
+ */
+ u8 maxphyaddr;
+
+ u64 efer;
+ u64 xcr0;
+ u64 xss;
+ u64 arch_capabilities;
+};
+
void kvm_spurious_fault(void);
#define KVM_NESTED_VMENTER_CONSISTENCY_CHECK(consistency_check) \
@@ -87,11 +103,18 @@ static inline unsigned int __shrink_ple_window(unsigned int val,
return max(val, min);
}
-#define MSR_IA32_CR_PAT_DEFAULT 0x0007040600070406ULL
+#define MSR_IA32_CR_PAT_DEFAULT \
+ PAT_VALUE(WB, WT, UC_MINUS, UC, WB, WT, UC_MINUS, UC)
void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu);
int kvm_check_nested_events(struct kvm_vcpu *vcpu);
+/* Forcibly leave the nested mode in cases like a vCPU reset */
+static inline void kvm_leave_nested(struct kvm_vcpu *vcpu)
+{
+ kvm_x86_ops.nested_ops->leave_nested(vcpu);
+}
+
static inline bool kvm_vcpu_has_run(struct kvm_vcpu *vcpu)
{
return vcpu->arch.last_vmentry_cpu != -1;
@@ -157,7 +180,7 @@ static inline bool is_64_bit_mode(struct kvm_vcpu *vcpu)
if (!is_long_mode(vcpu))
return false;
- static_call(kvm_x86_get_cs_db_l_bits)(vcpu, &cs_db, &cs_l);
+ kvm_x86_call(get_cs_db_l_bits)(vcpu, &cs_db, &cs_l);
return cs_l;
}
@@ -309,12 +332,8 @@ int handle_ud(struct kvm_vcpu *vcpu);
void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu,
struct kvm_queued_exception *ex);
-void kvm_vcpu_mtrr_init(struct kvm_vcpu *vcpu);
-u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn);
int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data);
int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
-bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn,
- int page_num);
bool kvm_vector_hashing_enabled(void);
void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code);
int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type,
@@ -322,12 +341,10 @@ int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type,
int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
int emulation_type, void *insn, int insn_len);
fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu);
-
-extern u64 host_xcr0;
-extern u64 host_xss;
-extern u64 host_arch_capabilities;
+fastpath_t handle_fastpath_hlt(struct kvm_vcpu *vcpu);
extern struct kvm_caps kvm_caps;
+extern struct kvm_host_values kvm_host;
extern bool enable_pmu;
@@ -495,13 +512,26 @@ int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r,
int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva);
bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type);
+enum kvm_msr_access {
+ MSR_TYPE_R = BIT(0),
+ MSR_TYPE_W = BIT(1),
+ MSR_TYPE_RW = MSR_TYPE_R | MSR_TYPE_W,
+};
+
/*
* Internal error codes that are used to indicate that MSR emulation encountered
- * an error that should result in #GP in the guest, unless userspace
- * handles it.
+ * an error that should result in #GP in the guest, unless userspace handles it.
+ * Note, '1', '0', and negative numbers are off limits, as they are used by KVM
+ * as part of KVM's lightly documented internal KVM_RUN return codes.
+ *
+ * UNSUPPORTED - The MSR isn't supported, either because it is completely
+ * unknown to KVM, or because the MSR should not exist according
+ * to the vCPU model.
+ *
+ * FILTERED - Access to the MSR is denied by a userspace MSR filter.
*/
-#define KVM_MSR_RET_INVALID 2 /* in-kernel MSR emulation #GP condition */
-#define KVM_MSR_RET_FILTERED 3 /* #GP due to userspace MSR filter */
+#define KVM_MSR_RET_UNSUPPORTED 2
+#define KVM_MSR_RET_FILTERED 3
#define __cr4_reserved_bits(__cpu_has, __c) \
({ \
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index f65b35a05d91..622fe24da910 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -741,7 +741,7 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
} else {
void __user * hva = u64_to_user_ptr(data->u.shared_info.hva);
- if (!PAGE_ALIGNED(hva) || !access_ok(hva, PAGE_SIZE)) {
+ if (!PAGE_ALIGNED(hva)) {
r = -EINVAL;
} else if (!hva) {
kvm_gpc_deactivate(&kvm->arch.xen.shinfo_cache);
@@ -1270,7 +1270,7 @@ int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data)
instructions[0] = 0xb8;
/* vmcall / vmmcall */
- static_call(kvm_x86_patch_hypercall)(vcpu, instructions + 5);
+ kvm_x86_call(patch_hypercall)(vcpu, instructions + 5);
/* ret */
instructions[8] = 0xc3;
@@ -1650,7 +1650,7 @@ int kvm_xen_hypercall(struct kvm_vcpu *vcpu)
params[5] = (u64)kvm_r9_read(vcpu);
}
#endif
- cpl = static_call(kvm_x86_get_cpl)(vcpu);
+ cpl = kvm_x86_call(get_cpl)(vcpu);
trace_kvm_xen_hypercall(cpl, input, params[0], params[1], params[2],
params[3], params[4], params[5]);
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 6da73513f026..98583a9dbab3 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -53,7 +53,6 @@ ifneq ($(CONFIG_X86_CMPXCHG64),y)
lib-y += atomic64_386_32.o
endif
else
- obj-y += iomap_copy_64.o
ifneq ($(CONFIG_GENERIC_CSUM),y)
lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o
endif
diff --git a/arch/x86/lib/atomic64_cx8_32.S b/arch/x86/lib/atomic64_cx8_32.S
index 90afb488b396..b2eff07d65e4 100644
--- a/arch/x86/lib/atomic64_cx8_32.S
+++ b/arch/x86/lib/atomic64_cx8_32.S
@@ -16,6 +16,11 @@
cmpxchg8b (\reg)
.endm
+.macro read64_nonatomic reg
+ movl (\reg), %eax
+ movl 4(\reg), %edx
+.endm
+
SYM_FUNC_START(atomic64_read_cx8)
read64 %ecx
RET
@@ -51,7 +56,7 @@ SYM_FUNC_START(atomic64_\func\()_return_cx8)
movl %edx, %edi
movl %ecx, %ebp
- read64 %ecx
+ read64_nonatomic %ecx
1:
movl %eax, %ebx
movl %edx, %ecx
@@ -79,7 +84,7 @@ addsub_return sub sub sbb
SYM_FUNC_START(atomic64_\func\()_return_cx8)
pushl %ebx
- read64 %esi
+ read64_nonatomic %esi
1:
movl %eax, %ebx
movl %edx, %ecx
diff --git a/arch/x86/lib/cmdline.c b/arch/x86/lib/cmdline.c
index 80570eb3c89b..c65cd5550454 100644
--- a/arch/x86/lib/cmdline.c
+++ b/arch/x86/lib/cmdline.c
@@ -6,8 +6,10 @@
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/ctype.h>
+
#include <asm/setup.h>
#include <asm/cmdline.h>
+#include <asm/bug.h>
static inline int myisspace(u8 c)
{
@@ -205,12 +207,29 @@ __cmdline_find_option(const char *cmdline, int max_cmdline_size,
int cmdline_find_option_bool(const char *cmdline, const char *option)
{
- return __cmdline_find_option_bool(cmdline, COMMAND_LINE_SIZE, option);
+ int ret;
+
+ ret = __cmdline_find_option_bool(cmdline, COMMAND_LINE_SIZE, option);
+ if (ret > 0)
+ return ret;
+
+ if (IS_ENABLED(CONFIG_CMDLINE_BOOL) && !builtin_cmdline_added)
+ return __cmdline_find_option_bool(builtin_cmdline, COMMAND_LINE_SIZE, option);
+
+ return ret;
}
int cmdline_find_option(const char *cmdline, const char *option, char *buffer,
int bufsize)
{
- return __cmdline_find_option(cmdline, COMMAND_LINE_SIZE, option,
- buffer, bufsize);
+ int ret;
+
+ ret = __cmdline_find_option(cmdline, COMMAND_LINE_SIZE, option, buffer, bufsize);
+ if (ret > 0)
+ return ret;
+
+ if (IS_ENABLED(CONFIG_CMDLINE_BOOL) && !builtin_cmdline_added)
+ return __cmdline_find_option(builtin_cmdline, COMMAND_LINE_SIZE, option, buffer, bufsize);
+
+ return ret;
}
diff --git a/arch/x86/lib/copy_mc.c b/arch/x86/lib/copy_mc.c
index 6e8b7e600def..97e88e58567b 100644
--- a/arch/x86/lib/copy_mc.c
+++ b/arch/x86/lib/copy_mc.c
@@ -4,6 +4,7 @@
#include <linux/jump_label.h>
#include <linux/uaccess.h>
#include <linux/export.h>
+#include <linux/instrumented.h>
#include <linux/string.h>
#include <linux/types.h>
@@ -61,10 +62,20 @@ unsigned long copy_mc_enhanced_fast_string(void *dst, const void *src, unsigned
*/
unsigned long __must_check copy_mc_to_kernel(void *dst, const void *src, unsigned len)
{
- if (copy_mc_fragile_enabled)
- return copy_mc_fragile(dst, src, len);
- if (static_cpu_has(X86_FEATURE_ERMS))
- return copy_mc_enhanced_fast_string(dst, src, len);
+ unsigned long ret;
+
+ if (copy_mc_fragile_enabled) {
+ instrument_memcpy_before(dst, src, len);
+ ret = copy_mc_fragile(dst, src, len);
+ instrument_memcpy_after(dst, src, len, ret);
+ return ret;
+ }
+ if (static_cpu_has(X86_FEATURE_ERMS)) {
+ instrument_memcpy_before(dst, src, len);
+ ret = copy_mc_enhanced_fast_string(dst, src, len);
+ instrument_memcpy_after(dst, src, len, ret);
+ return ret;
+ }
memcpy(dst, src, len);
return 0;
}
@@ -75,6 +86,7 @@ unsigned long __must_check copy_mc_to_user(void __user *dst, const void *src, un
unsigned long ret;
if (copy_mc_fragile_enabled) {
+ instrument_copy_to_user(dst, src, len);
__uaccess_begin();
ret = copy_mc_fragile((__force void *)dst, src, len);
__uaccess_end();
@@ -82,6 +94,7 @@ unsigned long __must_check copy_mc_to_user(void __user *dst, const void *src, un
}
if (static_cpu_has(X86_FEATURE_ERMS)) {
+ instrument_copy_to_user(dst, src, len);
__uaccess_begin();
ret = copy_mc_enhanced_fast_string((__force void *)dst, src, len);
__uaccess_end();
diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S
index 10d5ed8b5990..d066aecf8aeb 100644
--- a/arch/x86/lib/getuser.S
+++ b/arch/x86/lib/getuser.S
@@ -50,11 +50,17 @@
.endif
.endm
+.macro UACCESS op src dst
+1: \op \src,\dst
+ _ASM_EXTABLE_UA(1b, __get_user_handle_exception)
+.endm
+
+
.text
SYM_FUNC_START(__get_user_1)
check_range size=1
ASM_STAC
-1: movzbl (%_ASM_AX),%edx
+ UACCESS movzbl (%_ASM_AX),%edx
xor %eax,%eax
ASM_CLAC
RET
@@ -64,7 +70,7 @@ EXPORT_SYMBOL(__get_user_1)
SYM_FUNC_START(__get_user_2)
check_range size=2
ASM_STAC
-2: movzwl (%_ASM_AX),%edx
+ UACCESS movzwl (%_ASM_AX),%edx
xor %eax,%eax
ASM_CLAC
RET
@@ -74,7 +80,7 @@ EXPORT_SYMBOL(__get_user_2)
SYM_FUNC_START(__get_user_4)
check_range size=4
ASM_STAC
-3: movl (%_ASM_AX),%edx
+ UACCESS movl (%_ASM_AX),%edx
xor %eax,%eax
ASM_CLAC
RET
@@ -82,13 +88,16 @@ SYM_FUNC_END(__get_user_4)
EXPORT_SYMBOL(__get_user_4)
SYM_FUNC_START(__get_user_8)
+#ifndef CONFIG_X86_64
+ xor %ecx,%ecx
+#endif
check_range size=8
ASM_STAC
#ifdef CONFIG_X86_64
-4: movq (%_ASM_AX),%rdx
+ UACCESS movq (%_ASM_AX),%rdx
#else
-4: movl (%_ASM_AX),%edx
-5: movl 4(%_ASM_AX),%ecx
+ UACCESS movl (%_ASM_AX),%edx
+ UACCESS movl 4(%_ASM_AX),%ecx
#endif
xor %eax,%eax
ASM_CLAC
@@ -100,7 +109,7 @@ EXPORT_SYMBOL(__get_user_8)
SYM_FUNC_START(__get_user_nocheck_1)
ASM_STAC
ASM_BARRIER_NOSPEC
-6: movzbl (%_ASM_AX),%edx
+ UACCESS movzbl (%_ASM_AX),%edx
xor %eax,%eax
ASM_CLAC
RET
@@ -110,7 +119,7 @@ EXPORT_SYMBOL(__get_user_nocheck_1)
SYM_FUNC_START(__get_user_nocheck_2)
ASM_STAC
ASM_BARRIER_NOSPEC
-7: movzwl (%_ASM_AX),%edx
+ UACCESS movzwl (%_ASM_AX),%edx
xor %eax,%eax
ASM_CLAC
RET
@@ -120,7 +129,7 @@ EXPORT_SYMBOL(__get_user_nocheck_2)
SYM_FUNC_START(__get_user_nocheck_4)
ASM_STAC
ASM_BARRIER_NOSPEC
-8: movl (%_ASM_AX),%edx
+ UACCESS movl (%_ASM_AX),%edx
xor %eax,%eax
ASM_CLAC
RET
@@ -131,10 +140,11 @@ SYM_FUNC_START(__get_user_nocheck_8)
ASM_STAC
ASM_BARRIER_NOSPEC
#ifdef CONFIG_X86_64
-9: movq (%_ASM_AX),%rdx
+ UACCESS movq (%_ASM_AX),%rdx
#else
-9: movl (%_ASM_AX),%edx
-10: movl 4(%_ASM_AX),%ecx
+ xor %ecx,%ecx
+ UACCESS movl (%_ASM_AX),%edx
+ UACCESS movl 4(%_ASM_AX),%ecx
#endif
xor %eax,%eax
ASM_CLAC
@@ -150,36 +160,3 @@ SYM_CODE_START_LOCAL(__get_user_handle_exception)
mov $(-EFAULT),%_ASM_AX
RET
SYM_CODE_END(__get_user_handle_exception)
-
-#ifdef CONFIG_X86_32
-SYM_CODE_START_LOCAL(__get_user_8_handle_exception)
- ASM_CLAC
-bad_get_user_8:
- xor %edx,%edx
- xor %ecx,%ecx
- mov $(-EFAULT),%_ASM_AX
- RET
-SYM_CODE_END(__get_user_8_handle_exception)
-#endif
-
-/* get_user */
- _ASM_EXTABLE_UA(1b, __get_user_handle_exception)
- _ASM_EXTABLE_UA(2b, __get_user_handle_exception)
- _ASM_EXTABLE_UA(3b, __get_user_handle_exception)
-#ifdef CONFIG_X86_64
- _ASM_EXTABLE_UA(4b, __get_user_handle_exception)
-#else
- _ASM_EXTABLE_UA(4b, __get_user_8_handle_exception)
- _ASM_EXTABLE_UA(5b, __get_user_8_handle_exception)
-#endif
-
-/* __get_user */
- _ASM_EXTABLE_UA(6b, __get_user_handle_exception)
- _ASM_EXTABLE_UA(7b, __get_user_handle_exception)
- _ASM_EXTABLE_UA(8b, __get_user_handle_exception)
-#ifdef CONFIG_X86_64
- _ASM_EXTABLE_UA(9b, __get_user_handle_exception)
-#else
- _ASM_EXTABLE_UA(9b, __get_user_8_handle_exception)
- _ASM_EXTABLE_UA(10b, __get_user_8_handle_exception)
-#endif
diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c
index 1bb155a0955b..6ffb931b9fb1 100644
--- a/arch/x86/lib/insn.c
+++ b/arch/x86/lib/insn.c
@@ -13,7 +13,7 @@
#endif
#include <asm/inat.h> /*__ignore_sync_check__ */
#include <asm/insn.h> /* __ignore_sync_check__ */
-#include <asm/unaligned.h> /* __ignore_sync_check__ */
+#include <linux/unaligned.h> /* __ignore_sync_check__ */
#include <linux/errno.h>
#include <linux/kconfig.h>
@@ -185,6 +185,17 @@ found:
if (X86_REX_W(b))
/* REX.W overrides opnd_size */
insn->opnd_bytes = 8;
+ } else if (inat_is_rex2_prefix(attr)) {
+ insn_set_byte(&insn->rex_prefix, 0, b);
+ b = peek_nbyte_next(insn_byte_t, insn, 1);
+ insn_set_byte(&insn->rex_prefix, 1, b);
+ insn->rex_prefix.nbytes = 2;
+ insn->next_byte += 2;
+ if (X86_REX_W(b))
+ /* REX.W overrides opnd_size */
+ insn->opnd_bytes = 8;
+ insn->rex_prefix.got = 1;
+ goto vex_end;
}
}
insn->rex_prefix.got = 1;
@@ -283,6 +294,10 @@ int insn_get_opcode(struct insn *insn)
m = insn_vex_m_bits(insn);
p = insn_vex_p_bits(insn);
insn->attr = inat_get_avx_attribute(op, m, p);
+ /* SCALABLE EVEX uses p bits to encode operand size */
+ if (inat_evex_scalable(insn->attr) && !insn_vex_w_bit(insn) &&
+ p == INAT_PFX_OPNDSZ)
+ insn->opnd_bytes = 2;
if ((inat_must_evex(insn->attr) && !insn_is_evex(insn)) ||
(!inat_accept_vex(insn->attr) &&
!inat_is_group(insn->attr))) {
@@ -294,6 +309,20 @@ int insn_get_opcode(struct insn *insn)
goto end;
}
+ /* Check if there is REX2 prefix or not */
+ if (insn_is_rex2(insn)) {
+ if (insn_rex2_m_bit(insn)) {
+ /* map 1 is escape 0x0f */
+ insn_attr_t esc_attr = inat_get_opcode_attribute(0x0f);
+
+ pfx_id = insn_last_prefix_id(insn);
+ insn->attr = inat_get_escape_attribute(op, pfx_id, esc_attr);
+ } else {
+ insn->attr = inat_get_opcode_attribute(op);
+ }
+ goto end;
+ }
+
insn->attr = inat_get_opcode_attribute(op);
while (inat_is_escape(insn->attr)) {
/* Get escaped opcode */
diff --git a/arch/x86/lib/iomap_copy_64.S b/arch/x86/lib/iomap_copy_64.S
deleted file mode 100644
index 6ff2f56cb0f7..000000000000
--- a/arch/x86/lib/iomap_copy_64.S
+++ /dev/null
@@ -1,15 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright 2006 PathScale, Inc. All Rights Reserved.
- */
-
-#include <linux/linkage.h>
-
-/*
- * override generic version in lib/iomap_copy.c
- */
-SYM_FUNC_START(__iowrite32_copy)
- movl %edx,%ecx
- rep movsl
- RET
-SYM_FUNC_END(__iowrite32_copy)
diff --git a/arch/x86/lib/iomem.c b/arch/x86/lib/iomem.c
index e0411a3774d4..5eecb45d05d5 100644
--- a/arch/x86/lib/iomem.c
+++ b/arch/x86/lib/iomem.c
@@ -25,6 +25,9 @@ static __always_inline void rep_movs(void *to, const void *from, size_t n)
static void string_memcpy_fromio(void *to, const volatile void __iomem *from, size_t n)
{
+ const void *orig_to = to;
+ const size_t orig_n = n;
+
if (unlikely(!n))
return;
@@ -39,7 +42,7 @@ static void string_memcpy_fromio(void *to, const volatile void __iomem *from, si
}
rep_movs(to, (const void *)from, n);
/* KMSAN must treat values read from devices as initialized. */
- kmsan_unpoison_memory(to, n);
+ kmsan_unpoison_memory(orig_to, orig_n);
}
static void string_memcpy_toio(volatile void __iomem *to, const void *from, size_t n)
diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt
index 12af572201a2..caedb3ef6688 100644
--- a/arch/x86/lib/x86-opcode-map.txt
+++ b/arch/x86/lib/x86-opcode-map.txt
@@ -23,6 +23,7 @@
#
# AVX Superscripts
# (ev): this opcode requires EVEX prefix.
+# (es): this opcode requires EVEX prefix and is SCALABALE.
# (evo): this opcode is changed by EVEX prefix (EVEX opcode)
# (v): this opcode requires VEX prefix.
# (v1): this opcode only supports 128bit VEX.
@@ -33,6 +34,10 @@
# - (F2): the last prefix is 0xF2
# - (!F3) : the last prefix is not 0xF3 (including non-last prefix case)
# - (66&F2): Both 0x66 and 0xF2 prefixes are specified.
+#
+# REX2 Prefix
+# - (!REX2): REX2 is not allowed
+# - (REX2): REX2 variant e.g. JMPABS
Table: one byte opcode
Referrer:
@@ -148,7 +153,7 @@ AVXcode:
65: SEG=GS (Prefix)
66: Operand-Size (Prefix)
67: Address-Size (Prefix)
-68: PUSH Iz (d64)
+68: PUSH Iz
69: IMUL Gv,Ev,Iz
6a: PUSH Ib (d64)
6b: IMUL Gv,Ev,Ib
@@ -157,22 +162,22 @@ AVXcode:
6e: OUTS/OUTSB DX,Xb
6f: OUTS/OUTSW/OUTSD DX,Xz
# 0x70 - 0x7f
-70: JO Jb
-71: JNO Jb
-72: JB/JNAE/JC Jb
-73: JNB/JAE/JNC Jb
-74: JZ/JE Jb
-75: JNZ/JNE Jb
-76: JBE/JNA Jb
-77: JNBE/JA Jb
-78: JS Jb
-79: JNS Jb
-7a: JP/JPE Jb
-7b: JNP/JPO Jb
-7c: JL/JNGE Jb
-7d: JNL/JGE Jb
-7e: JLE/JNG Jb
-7f: JNLE/JG Jb
+70: JO Jb (!REX2)
+71: JNO Jb (!REX2)
+72: JB/JNAE/JC Jb (!REX2)
+73: JNB/JAE/JNC Jb (!REX2)
+74: JZ/JE Jb (!REX2)
+75: JNZ/JNE Jb (!REX2)
+76: JBE/JNA Jb (!REX2)
+77: JNBE/JA Jb (!REX2)
+78: JS Jb (!REX2)
+79: JNS Jb (!REX2)
+7a: JP/JPE Jb (!REX2)
+7b: JNP/JPO Jb (!REX2)
+7c: JL/JNGE Jb (!REX2)
+7d: JNL/JGE Jb (!REX2)
+7e: JLE/JNG Jb (!REX2)
+7f: JNLE/JG Jb (!REX2)
# 0x80 - 0x8f
80: Grp1 Eb,Ib (1A)
81: Grp1 Ev,Iz (1A)
@@ -208,24 +213,24 @@ AVXcode:
9e: SAHF
9f: LAHF
# 0xa0 - 0xaf
-a0: MOV AL,Ob
-a1: MOV rAX,Ov
-a2: MOV Ob,AL
-a3: MOV Ov,rAX
-a4: MOVS/B Yb,Xb
-a5: MOVS/W/D/Q Yv,Xv
-a6: CMPS/B Xb,Yb
-a7: CMPS/W/D Xv,Yv
-a8: TEST AL,Ib
-a9: TEST rAX,Iz
-aa: STOS/B Yb,AL
-ab: STOS/W/D/Q Yv,rAX
-ac: LODS/B AL,Xb
-ad: LODS/W/D/Q rAX,Xv
-ae: SCAS/B AL,Yb
+a0: MOV AL,Ob (!REX2)
+a1: MOV rAX,Ov (!REX2) | JMPABS O (REX2),(o64)
+a2: MOV Ob,AL (!REX2)
+a3: MOV Ov,rAX (!REX2)
+a4: MOVS/B Yb,Xb (!REX2)
+a5: MOVS/W/D/Q Yv,Xv (!REX2)
+a6: CMPS/B Xb,Yb (!REX2)
+a7: CMPS/W/D Xv,Yv (!REX2)
+a8: TEST AL,Ib (!REX2)
+a9: TEST rAX,Iz (!REX2)
+aa: STOS/B Yb,AL (!REX2)
+ab: STOS/W/D/Q Yv,rAX (!REX2)
+ac: LODS/B AL,Xb (!REX2)
+ad: LODS/W/D/Q rAX,Xv (!REX2)
+ae: SCAS/B AL,Yb (!REX2)
# Note: The May 2011 Intel manual shows Xv for the second parameter of the
# next instruction but Yv is correct
-af: SCAS/W/D/Q rAX,Yv
+af: SCAS/W/D/Q rAX,Yv (!REX2)
# 0xb0 - 0xbf
b0: MOV AL/R8L,Ib
b1: MOV CL/R9L,Ib
@@ -266,7 +271,7 @@ d1: Grp2 Ev,1 (1A)
d2: Grp2 Eb,CL (1A)
d3: Grp2 Ev,CL (1A)
d4: AAM Ib (i64)
-d5: AAD Ib (i64)
+d5: AAD Ib (i64) | REX2 (Prefix),(o64)
d6:
d7: XLAT/XLATB
d8: ESC
@@ -281,26 +286,26 @@ df: ESC
# Note: "forced64" is Intel CPU behavior: they ignore 0x66 prefix
# in 64-bit mode. AMD CPUs accept 0x66 prefix, it causes RIP truncation
# to 16 bits. In 32-bit mode, 0x66 is accepted by both Intel and AMD.
-e0: LOOPNE/LOOPNZ Jb (f64)
-e1: LOOPE/LOOPZ Jb (f64)
-e2: LOOP Jb (f64)
-e3: JrCXZ Jb (f64)
-e4: IN AL,Ib
-e5: IN eAX,Ib
-e6: OUT Ib,AL
-e7: OUT Ib,eAX
+e0: LOOPNE/LOOPNZ Jb (f64) (!REX2)
+e1: LOOPE/LOOPZ Jb (f64) (!REX2)
+e2: LOOP Jb (f64) (!REX2)
+e3: JrCXZ Jb (f64) (!REX2)
+e4: IN AL,Ib (!REX2)
+e5: IN eAX,Ib (!REX2)
+e6: OUT Ib,AL (!REX2)
+e7: OUT Ib,eAX (!REX2)
# With 0x66 prefix in 64-bit mode, for AMD CPUs immediate offset
# in "near" jumps and calls is 16-bit. For CALL,
# push of return address is 16-bit wide, RSP is decremented by 2
# but is not truncated to 16 bits, unlike RIP.
-e8: CALL Jz (f64)
-e9: JMP-near Jz (f64)
-ea: JMP-far Ap (i64)
-eb: JMP-short Jb (f64)
-ec: IN AL,DX
-ed: IN eAX,DX
-ee: OUT DX,AL
-ef: OUT DX,eAX
+e8: CALL Jz (f64) (!REX2)
+e9: JMP-near Jz (f64) (!REX2)
+ea: JMP-far Ap (i64) (!REX2)
+eb: JMP-short Jb (f64) (!REX2)
+ec: IN AL,DX (!REX2)
+ed: IN eAX,DX (!REX2)
+ee: OUT DX,AL (!REX2)
+ef: OUT DX,eAX (!REX2)
# 0xf0 - 0xff
f0: LOCK (Prefix)
f1:
@@ -386,14 +391,14 @@ AVXcode: 1
2e: vucomiss Vss,Wss (v1) | vucomisd Vsd,Wsd (66),(v1)
2f: vcomiss Vss,Wss (v1) | vcomisd Vsd,Wsd (66),(v1)
# 0x0f 0x30-0x3f
-30: WRMSR
-31: RDTSC
-32: RDMSR
-33: RDPMC
-34: SYSENTER
-35: SYSEXIT
+30: WRMSR (!REX2)
+31: RDTSC (!REX2)
+32: RDMSR (!REX2)
+33: RDPMC (!REX2)
+34: SYSENTER (!REX2)
+35: SYSEXIT (!REX2)
36:
-37: GETSEC
+37: GETSEC (!REX2)
38: escape # 3-byte escape 1
39:
3a: escape # 3-byte escape 2
@@ -473,22 +478,22 @@ AVXcode: 1
7f: movq Qq,Pq | vmovdqa Wx,Vx (66) | vmovdqa32/64 Wx,Vx (66),(evo) | vmovdqu Wx,Vx (F3) | vmovdqu32/64 Wx,Vx (F3),(evo) | vmovdqu8/16 Wx,Vx (F2),(ev)
# 0x0f 0x80-0x8f
# Note: "forced64" is Intel CPU behavior (see comment about CALL insn).
-80: JO Jz (f64)
-81: JNO Jz (f64)
-82: JB/JC/JNAE Jz (f64)
-83: JAE/JNB/JNC Jz (f64)
-84: JE/JZ Jz (f64)
-85: JNE/JNZ Jz (f64)
-86: JBE/JNA Jz (f64)
-87: JA/JNBE Jz (f64)
-88: JS Jz (f64)
-89: JNS Jz (f64)
-8a: JP/JPE Jz (f64)
-8b: JNP/JPO Jz (f64)
-8c: JL/JNGE Jz (f64)
-8d: JNL/JGE Jz (f64)
-8e: JLE/JNG Jz (f64)
-8f: JNLE/JG Jz (f64)
+80: JO Jz (f64) (!REX2)
+81: JNO Jz (f64) (!REX2)
+82: JB/JC/JNAE Jz (f64) (!REX2)
+83: JAE/JNB/JNC Jz (f64) (!REX2)
+84: JE/JZ Jz (f64) (!REX2)
+85: JNE/JNZ Jz (f64) (!REX2)
+86: JBE/JNA Jz (f64) (!REX2)
+87: JA/JNBE Jz (f64) (!REX2)
+88: JS Jz (f64) (!REX2)
+89: JNS Jz (f64) (!REX2)
+8a: JP/JPE Jz (f64) (!REX2)
+8b: JNP/JPO Jz (f64) (!REX2)
+8c: JL/JNGE Jz (f64) (!REX2)
+8d: JNL/JGE Jz (f64) (!REX2)
+8e: JLE/JNG Jz (f64) (!REX2)
+8f: JNLE/JG Jz (f64) (!REX2)
# 0x0f 0x90-0x9f
90: SETO Eb | kmovw/q Vk,Wk | kmovb/d Vk,Wk (66)
91: SETNO Eb | kmovw/q Mv,Vk | kmovb/d Mv,Vk (66)
@@ -698,17 +703,17 @@ AVXcode: 2
4d: vrcp14ss/d Vsd,Hpd,Wsd (66),(ev)
4e: vrsqrt14ps/d Vpd,Wpd (66),(ev)
4f: vrsqrt14ss/d Vsd,Hsd,Wsd (66),(ev)
-50: vpdpbusd Vx,Hx,Wx (66),(ev)
-51: vpdpbusds Vx,Hx,Wx (66),(ev)
-52: vdpbf16ps Vx,Hx,Wx (F3),(ev) | vpdpwssd Vx,Hx,Wx (66),(ev) | vp4dpwssd Vdqq,Hdqq,Wdq (F2),(ev)
-53: vpdpwssds Vx,Hx,Wx (66),(ev) | vp4dpwssds Vdqq,Hdqq,Wdq (F2),(ev)
+50: vpdpbusd Vx,Hx,Wx (66) | vpdpbssd Vx,Hx,Wx (F2),(v) | vpdpbsud Vx,Hx,Wx (F3),(v) | vpdpbuud Vx,Hx,Wx (v)
+51: vpdpbusds Vx,Hx,Wx (66) | vpdpbssds Vx,Hx,Wx (F2),(v) | vpdpbsuds Vx,Hx,Wx (F3),(v) | vpdpbuuds Vx,Hx,Wx (v)
+52: vdpbf16ps Vx,Hx,Wx (F3),(ev) | vpdpwssd Vx,Hx,Wx (66) | vp4dpwssd Vdqq,Hdqq,Wdq (F2),(ev)
+53: vpdpwssds Vx,Hx,Wx (66) | vp4dpwssds Vdqq,Hdqq,Wdq (F2),(ev)
54: vpopcntb/w Vx,Wx (66),(ev)
55: vpopcntd/q Vx,Wx (66),(ev)
58: vpbroadcastd Vx,Wx (66),(v)
59: vpbroadcastq Vx,Wx (66),(v) | vbroadcasti32x2 Vx,Wx (66),(evo)
5a: vbroadcasti128 Vqq,Mdq (66),(v) | vbroadcasti32x4/64x2 Vx,Wx (66),(evo)
5b: vbroadcasti32x8/64x4 Vqq,Mdq (66),(ev)
-5c: TDPBF16PS Vt,Wt,Ht (F3),(v1)
+5c: TDPBF16PS Vt,Wt,Ht (F3),(v1) | TDPFP16PS Vt,Wt,Ht (F2),(v1),(o64)
# Skip 0x5d
5e: TDPBSSD Vt,Wt,Ht (F2),(v1) | TDPBSUD Vt,Wt,Ht (F3),(v1) | TDPBUSD Vt,Wt,Ht (66),(v1) | TDPBUUD Vt,Wt,Ht (v1)
# Skip 0x5f-0x61
@@ -718,10 +723,12 @@ AVXcode: 2
65: vblendmps/d Vx,Hx,Wx (66),(ev)
66: vpblendmb/w Vx,Hx,Wx (66),(ev)
68: vp2intersectd/q Kx,Hx,Wx (F2),(ev)
-# Skip 0x69-0x6f
+# Skip 0x69-0x6b
+6c: TCMMIMFP16PS Vt,Wt,Ht (66),(v1),(o64) | TCMMRLFP16PS Vt,Wt,Ht (v1),(o64)
+# Skip 0x6d-0x6f
70: vpshldvw Vx,Hx,Wx (66),(ev)
71: vpshldvd/q Vx,Hx,Wx (66),(ev)
-72: vcvtne2ps2bf16 Vx,Hx,Wx (F2),(ev) | vcvtneps2bf16 Vx,Wx (F3),(ev) | vpshrdvw Vx,Hx,Wx (66),(ev)
+72: vcvtne2ps2bf16 Vx,Hx,Wx (F2),(ev) | vcvtneps2bf16 Vx,Wx (F3) | vpshrdvw Vx,Hx,Wx (66),(ev)
73: vpshrdvd/q Vx,Hx,Wx (66),(ev)
75: vpermi2b/w Vx,Hx,Wx (66),(ev)
76: vpermi2d/q Vx,Hx,Wx (66),(ev)
@@ -777,8 +784,10 @@ ac: vfnmadd213ps/d Vx,Hx,Wx (66),(v)
ad: vfnmadd213ss/d Vx,Hx,Wx (66),(v),(v1)
ae: vfnmsub213ps/d Vx,Hx,Wx (66),(v)
af: vfnmsub213ss/d Vx,Hx,Wx (66),(v),(v1)
-b4: vpmadd52luq Vx,Hx,Wx (66),(ev)
-b5: vpmadd52huq Vx,Hx,Wx (66),(ev)
+b0: vcvtneebf162ps Vx,Mx (F3),(!11B),(v) | vcvtneeph2ps Vx,Mx (66),(!11B),(v) | vcvtneobf162ps Vx,Mx (F2),(!11B),(v) | vcvtneoph2ps Vx,Mx (!11B),(v)
+b1: vbcstnebf162ps Vx,Mw (F3),(!11B),(v) | vbcstnesh2ps Vx,Mw (66),(!11B),(v)
+b4: vpmadd52luq Vx,Hx,Wx (66)
+b5: vpmadd52huq Vx,Hx,Wx (66)
b6: vfmaddsub231ps/d Vx,Hx,Wx (66),(v)
b7: vfmsubadd231ps/d Vx,Hx,Wx (66),(v)
b8: vfmadd231ps/d Vx,Hx,Wx (66),(v)
@@ -796,15 +805,35 @@ c7: Grp19 (1A)
c8: sha1nexte Vdq,Wdq | vexp2ps/d Vx,Wx (66),(ev)
c9: sha1msg1 Vdq,Wdq
ca: sha1msg2 Vdq,Wdq | vrcp28ps/d Vx,Wx (66),(ev)
-cb: sha256rnds2 Vdq,Wdq | vrcp28ss/d Vx,Hx,Wx (66),(ev)
-cc: sha256msg1 Vdq,Wdq | vrsqrt28ps/d Vx,Wx (66),(ev)
-cd: sha256msg2 Vdq,Wdq | vrsqrt28ss/d Vx,Hx,Wx (66),(ev)
+cb: sha256rnds2 Vdq,Wdq | vrcp28ss/d Vx,Hx,Wx (66),(ev) | vsha512rnds2 Vqq,Hqq,Udq (F2),(11B),(v)
+cc: sha256msg1 Vdq,Wdq | vrsqrt28ps/d Vx,Wx (66),(ev) | vsha512msg1 Vqq,Udq (F2),(11B),(v)
+cd: sha256msg2 Vdq,Wdq | vrsqrt28ss/d Vx,Hx,Wx (66),(ev) | vsha512msg2 Vqq,Uqq (F2),(11B),(v)
cf: vgf2p8mulb Vx,Wx (66)
+d2: vpdpwsud Vx,Hx,Wx (F3),(v) | vpdpwusd Vx,Hx,Wx (66),(v) | vpdpwuud Vx,Hx,Wx (v)
+d3: vpdpwsuds Vx,Hx,Wx (F3),(v) | vpdpwusds Vx,Hx,Wx (66),(v) | vpdpwuuds Vx,Hx,Wx (v)
+d8: AESENCWIDE128KL Qpi (F3),(000),(00B) | AESENCWIDE256KL Qpi (F3),(000),(10B) | AESDECWIDE128KL Qpi (F3),(000),(01B) | AESDECWIDE256KL Qpi (F3),(000),(11B)
+da: vsm3msg1 Vdq,Hdq,Udq (v1) | vsm3msg2 Vdq,Hdq,Udq (66),(v1) | vsm4key4 Vx,Hx,Wx (F3),(v) | vsm4rnds4 Vx,Hx,Wx (F2),(v)
db: VAESIMC Vdq,Wdq (66),(v1)
-dc: vaesenc Vx,Hx,Wx (66)
-dd: vaesenclast Vx,Hx,Wx (66)
-de: vaesdec Vx,Hx,Wx (66)
-df: vaesdeclast Vx,Hx,Wx (66)
+dc: vaesenc Vx,Hx,Wx (66) | LOADIWKEY Vx,Hx (F3) | AESENC128KL Vpd,Qpi (F3)
+dd: vaesenclast Vx,Hx,Wx (66) | AESDEC128KL Vpd,Qpi (F3)
+de: vaesdec Vx,Hx,Wx (66) | AESENC256KL Vpd,Qpi (F3)
+df: vaesdeclast Vx,Hx,Wx (66) | AESDEC256KL Vpd,Qpi (F3)
+e0: CMPOXADD My,Gy,By (66),(v1),(o64)
+e1: CMPNOXADD My,Gy,By (66),(v1),(o64)
+e2: CMPBXADD My,Gy,By (66),(v1),(o64)
+e3: CMPNBXADD My,Gy,By (66),(v1),(o64)
+e4: CMPZXADD My,Gy,By (66),(v1),(o64)
+e5: CMPNZXADD My,Gy,By (66),(v1),(o64)
+e6: CMPBEXADD My,Gy,By (66),(v1),(o64)
+e7: CMPNBEXADD My,Gy,By (66),(v1),(o64)
+e8: CMPSXADD My,Gy,By (66),(v1),(o64)
+e9: CMPNSXADD My,Gy,By (66),(v1),(o64)
+ea: CMPPXADD My,Gy,By (66),(v1),(o64)
+eb: CMPNPXADD My,Gy,By (66),(v1),(o64)
+ec: CMPLXADD My,Gy,By (66),(v1),(o64)
+ed: CMPNLXADD My,Gy,By (66),(v1),(o64)
+ee: CMPLEXADD My,Gy,By (66),(v1),(o64)
+ef: CMPNLEXADD My,Gy,By (66),(v1),(o64)
f0: MOVBE Gy,My | MOVBE Gw,Mw (66) | CRC32 Gd,Eb (F2) | CRC32 Gd,Eb (66&F2)
f1: MOVBE My,Gy | MOVBE Mw,Gw (66) | CRC32 Gd,Ey (F2) | CRC32 Gd,Ew (66&F2)
f2: ANDN Gy,By,Ey (v)
@@ -812,8 +841,11 @@ f3: Grp17 (1A)
f5: BZHI Gy,Ey,By (v) | PEXT Gy,By,Ey (F3),(v) | PDEP Gy,By,Ey (F2),(v) | WRUSSD/Q My,Gy (66)
f6: ADCX Gy,Ey (66) | ADOX Gy,Ey (F3) | MULX By,Gy,rDX,Ey (F2),(v) | WRSSD/Q My,Gy
f7: BEXTR Gy,Ey,By (v) | SHLX Gy,Ey,By (66),(v) | SARX Gy,Ey,By (F3),(v) | SHRX Gy,Ey,By (F2),(v)
-f8: MOVDIR64B Gv,Mdqq (66) | ENQCMD Gv,Mdqq (F2) | ENQCMDS Gv,Mdqq (F3)
+f8: MOVDIR64B Gv,Mdqq (66) | ENQCMD Gv,Mdqq (F2) | ENQCMDS Gv,Mdqq (F3) | URDMSR Rq,Gq (F2),(11B) | UWRMSR Gq,Rq (F3),(11B)
f9: MOVDIRI My,Gy
+fa: ENCODEKEY128 Ew,Ew (F3)
+fb: ENCODEKEY256 Ew,Ew (F3)
+fc: AADD My,Gy | AAND My,Gy (66) | AOR My,Gy (F2) | AXOR My,Gy (F3)
EndTable
Table: 3-byte opcode 2 (0x0f 0x3a)
@@ -893,10 +925,103 @@ c2: vcmpph Vx,Hx,Wx,Ib (ev) | vcmpsh Vx,Hx,Wx,Ib (F3),(ev)
cc: sha1rnds4 Vdq,Wdq,Ib
ce: vgf2p8affineqb Vx,Wx,Ib (66)
cf: vgf2p8affineinvqb Vx,Wx,Ib (66)
+de: vsm3rnds2 Vdq,Hdq,Wdq,Ib (66),(v1)
df: VAESKEYGEN Vdq,Wdq,Ib (66),(v1)
f0: RORX Gy,Ey,Ib (F2),(v) | HRESET Gv,Ib (F3),(000),(11B)
EndTable
+Table: EVEX map 4
+Referrer:
+AVXcode: 4
+00: ADD Eb,Gb (ev)
+01: ADD Ev,Gv (es) | ADD Ev,Gv (66),(es)
+02: ADD Gb,Eb (ev)
+03: ADD Gv,Ev (es) | ADD Gv,Ev (66),(es)
+08: OR Eb,Gb (ev)
+09: OR Ev,Gv (es) | OR Ev,Gv (66),(es)
+0a: OR Gb,Eb (ev)
+0b: OR Gv,Ev (es) | OR Gv,Ev (66),(es)
+10: ADC Eb,Gb (ev)
+11: ADC Ev,Gv (es) | ADC Ev,Gv (66),(es)
+12: ADC Gb,Eb (ev)
+13: ADC Gv,Ev (es) | ADC Gv,Ev (66),(es)
+18: SBB Eb,Gb (ev)
+19: SBB Ev,Gv (es) | SBB Ev,Gv (66),(es)
+1a: SBB Gb,Eb (ev)
+1b: SBB Gv,Ev (es) | SBB Gv,Ev (66),(es)
+20: AND Eb,Gb (ev)
+21: AND Ev,Gv (es) | AND Ev,Gv (66),(es)
+22: AND Gb,Eb (ev)
+23: AND Gv,Ev (es) | AND Gv,Ev (66),(es)
+24: SHLD Ev,Gv,Ib (es) | SHLD Ev,Gv,Ib (66),(es)
+28: SUB Eb,Gb (ev)
+29: SUB Ev,Gv (es) | SUB Ev,Gv (66),(es)
+2a: SUB Gb,Eb (ev)
+2b: SUB Gv,Ev (es) | SUB Gv,Ev (66),(es)
+2c: SHRD Ev,Gv,Ib (es) | SHRD Ev,Gv,Ib (66),(es)
+30: XOR Eb,Gb (ev)
+31: XOR Ev,Gv (es) | XOR Ev,Gv (66),(es)
+32: XOR Gb,Eb (ev)
+33: XOR Gv,Ev (es) | XOR Gv,Ev (66),(es)
+# CCMPSCC instructions are: CCOMB, CCOMBE, CCOMF, CCOML, CCOMLE, CCOMNB, CCOMNBE, CCOMNL, CCOMNLE,
+# CCOMNO, CCOMNS, CCOMNZ, CCOMO, CCOMS, CCOMT, CCOMZ
+38: CCMPSCC Eb,Gb (ev)
+39: CCMPSCC Ev,Gv (es) | CCMPSCC Ev,Gv (66),(es)
+3a: CCMPSCC Gv,Ev (ev)
+3b: CCMPSCC Gv,Ev (es) | CCMPSCC Gv,Ev (66),(es)
+40: CMOVO Gv,Ev (es) | CMOVO Gv,Ev (66),(es) | CFCMOVO Ev,Ev (es) | CFCMOVO Ev,Ev (66),(es) | SETO Eb (F2),(ev)
+41: CMOVNO Gv,Ev (es) | CMOVNO Gv,Ev (66),(es) | CFCMOVNO Ev,Ev (es) | CFCMOVNO Ev,Ev (66),(es) | SETNO Eb (F2),(ev)
+42: CMOVB Gv,Ev (es) | CMOVB Gv,Ev (66),(es) | CFCMOVB Ev,Ev (es) | CFCMOVB Ev,Ev (66),(es) | SETB Eb (F2),(ev)
+43: CMOVNB Gv,Ev (es) | CMOVNB Gv,Ev (66),(es) | CFCMOVNB Ev,Ev (es) | CFCMOVNB Ev,Ev (66),(es) | SETNB Eb (F2),(ev)
+44: CMOVZ Gv,Ev (es) | CMOVZ Gv,Ev (66),(es) | CFCMOVZ Ev,Ev (es) | CFCMOVZ Ev,Ev (66),(es) | SETZ Eb (F2),(ev)
+45: CMOVNZ Gv,Ev (es) | CMOVNZ Gv,Ev (66),(es) | CFCMOVNZ Ev,Ev (es) | CFCMOVNZ Ev,Ev (66),(es) | SETNZ Eb (F2),(ev)
+46: CMOVBE Gv,Ev (es) | CMOVBE Gv,Ev (66),(es) | CFCMOVBE Ev,Ev (es) | CFCMOVBE Ev,Ev (66),(es) | SETBE Eb (F2),(ev)
+47: CMOVNBE Gv,Ev (es) | CMOVNBE Gv,Ev (66),(es) | CFCMOVNBE Ev,Ev (es) | CFCMOVNBE Ev,Ev (66),(es) | SETNBE Eb (F2),(ev)
+48: CMOVS Gv,Ev (es) | CMOVS Gv,Ev (66),(es) | CFCMOVS Ev,Ev (es) | CFCMOVS Ev,Ev (66),(es) | SETS Eb (F2),(ev)
+49: CMOVNS Gv,Ev (es) | CMOVNS Gv,Ev (66),(es) | CFCMOVNS Ev,Ev (es) | CFCMOVNS Ev,Ev (66),(es) | SETNS Eb (F2),(ev)
+4a: CMOVP Gv,Ev (es) | CMOVP Gv,Ev (66),(es) | CFCMOVP Ev,Ev (es) | CFCMOVP Ev,Ev (66),(es) | SETP Eb (F2),(ev)
+4b: CMOVNP Gv,Ev (es) | CMOVNP Gv,Ev (66),(es) | CFCMOVNP Ev,Ev (es) | CFCMOVNP Ev,Ev (66),(es) | SETNP Eb (F2),(ev)
+4c: CMOVL Gv,Ev (es) | CMOVL Gv,Ev (66),(es) | CFCMOVL Ev,Ev (es) | CFCMOVL Ev,Ev (66),(es) | SETL Eb (F2),(ev)
+4d: CMOVNL Gv,Ev (es) | CMOVNL Gv,Ev (66),(es) | CFCMOVNL Ev,Ev (es) | CFCMOVNL Ev,Ev (66),(es) | SETNL Eb (F2),(ev)
+4e: CMOVLE Gv,Ev (es) | CMOVLE Gv,Ev (66),(es) | CFCMOVLE Ev,Ev (es) | CFCMOVLE Ev,Ev (66),(es) | SETLE Eb (F2),(ev)
+4f: CMOVNLE Gv,Ev (es) | CMOVNLE Gv,Ev (66),(es) | CFCMOVNLE Ev,Ev (es) | CFCMOVNLE Ev,Ev (66),(es) | SETNLE Eb (F2),(ev)
+60: MOVBE Gv,Ev (es) | MOVBE Gv,Ev (66),(es)
+61: MOVBE Ev,Gv (es) | MOVBE Ev,Gv (66),(es)
+65: WRUSSD Md,Gd (66),(ev) | WRUSSQ Mq,Gq (66),(ev)
+66: ADCX Gy,Ey (66),(ev) | ADOX Gy,Ey (F3),(ev) | WRSSD Md,Gd (ev) | WRSSQ Mq,Gq (66),(ev)
+69: IMUL Gv,Ev,Iz (es) | IMUL Gv,Ev,Iz (66),(es)
+6b: IMUL Gv,Ev,Ib (es) | IMUL Gv,Ev,Ib (66),(es)
+80: Grp1 Eb,Ib (1A),(ev)
+81: Grp1 Ev,Iz (1A),(es)
+83: Grp1 Ev,Ib (1A),(es)
+# CTESTSCC instructions are: CTESTB, CTESTBE, CTESTF, CTESTL, CTESTLE, CTESTNB, CTESTNBE, CTESTNL,
+# CTESTNLE, CTESTNO, CTESTNS, CTESTNZ, CTESTO, CTESTS, CTESTT, CTESTZ
+84: CTESTSCC (ev)
+85: CTESTSCC (es) | CTESTSCC (66),(es)
+88: POPCNT Gv,Ev (es) | POPCNT Gv,Ev (66),(es)
+8f: POP2 Bq,Rq (000),(11B),(ev)
+a5: SHLD Ev,Gv,CL (es) | SHLD Ev,Gv,CL (66),(es)
+ad: SHRD Ev,Gv,CL (es) | SHRD Ev,Gv,CL (66),(es)
+af: IMUL Gv,Ev (es) | IMUL Gv,Ev (66),(es)
+c0: Grp2 Eb,Ib (1A),(ev)
+c1: Grp2 Ev,Ib (1A),(es)
+d0: Grp2 Eb,1 (1A),(ev)
+d1: Grp2 Ev,1 (1A),(es)
+d2: Grp2 Eb,CL (1A),(ev)
+d3: Grp2 Ev,CL (1A),(es)
+f0: CRC32 Gy,Eb (es) | INVEPT Gq,Mdq (F3),(ev)
+f1: CRC32 Gy,Ey (es) | CRC32 Gy,Ey (66),(es) | INVVPID Gy,Mdq (F3),(ev)
+f2: INVPCID Gy,Mdq (F3),(ev)
+f4: TZCNT Gv,Ev (es) | TZCNT Gv,Ev (66),(es)
+f5: LZCNT Gv,Ev (es) | LZCNT Gv,Ev (66),(es)
+f6: Grp3_1 Eb (1A),(ev)
+f7: Grp3_2 Ev (1A),(es)
+f8: MOVDIR64B Gv,Mdqq (66),(ev) | ENQCMD Gv,Mdqq (F2),(ev) | ENQCMDS Gv,Mdqq (F3),(ev) | URDMSR Rq,Gq (F2),(11B),(ev) | UWRMSR Gq,Rq (F3),(11B),(ev)
+f9: MOVDIRI My,Gy (ev)
+fe: Grp4 (1A),(ev)
+ff: Grp5 (1A),(es) | PUSH2 Bq,Rq (110),(11B),(ev)
+EndTable
+
Table: EVEX map 5
Referrer:
AVXcode: 5
@@ -975,6 +1100,12 @@ d6: vfcmulcph Vx,Hx,Wx (F2),(ev) | vfmulcph Vx,Hx,Wx (F3),(ev)
d7: vfcmulcsh Vx,Hx,Wx (F2),(ev) | vfmulcsh Vx,Hx,Wx (F3),(ev)
EndTable
+Table: VEX map 7
+Referrer:
+AVXcode: 7
+f8: URDMSR Rq,Id (F2),(v1),(11B) | UWRMSR Id,Rq (F3),(v1),(11B)
+EndTable
+
GrpTable: Grp1
0: ADD
1: OR
@@ -1051,7 +1182,7 @@ GrpTable: Grp6
EndTable
GrpTable: Grp7
-0: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B) | PCONFIG (101),(11B) | ENCLV (000),(11B) | WRMSRNS (110),(11B)
+0: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B) | PCONFIG (101),(11B) | ENCLV (000),(11B) | WRMSRNS (110),(11B) | RDMSRLIST (F2),(110),(11B) | WRMSRLIST (F3),(110),(11B) | PBNDKB (111),(11B)
1: SIDT Ms | MONITOR (000),(11B) | MWAIT (001),(11B) | CLAC (010),(11B) | STAC (011),(11B) | ENCLS (111),(11B) | ERETU (F3),(010),(11B) | ERETS (F2),(010),(11B)
2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) | VMFUNC (100),(11B) | XEND (101)(11B) | XTEST (110)(11B) | ENCLU (111),(11B)
3: LIDT Ms
@@ -1137,6 +1268,8 @@ GrpTable: Grp16
1: prefetch T0
2: prefetch T1
3: prefetch T2
+6: prefetch IT1
+7: prefetch IT0
EndTable
GrpTable: Grp17
diff --git a/arch/x86/math-emu/fpu_etc.c b/arch/x86/math-emu/fpu_etc.c
index 1b118fd93140..39423ec409e1 100644
--- a/arch/x86/math-emu/fpu_etc.c
+++ b/arch/x86/math-emu/fpu_etc.c
@@ -120,9 +120,14 @@ static void fxam(FPU_REG *st0_ptr, u_char st0tag)
setcc(c);
}
+static void FPU_ST0_illegal(FPU_REG *st0_ptr, u_char st0_tag)
+{
+ FPU_illegal();
+}
+
static FUNC_ST0 const fp_etc_table[] = {
- fchs, fabs, (FUNC_ST0) FPU_illegal, (FUNC_ST0) FPU_illegal,
- ftst_, fxam, (FUNC_ST0) FPU_illegal, (FUNC_ST0) FPU_illegal
+ fchs, fabs, FPU_ST0_illegal, FPU_ST0_illegal,
+ ftst_, fxam, FPU_ST0_illegal, FPU_ST0_illegal,
};
void FPU_etc(void)
diff --git a/arch/x86/math-emu/fpu_trig.c b/arch/x86/math-emu/fpu_trig.c
index 990d847ae902..85daf98c81c3 100644
--- a/arch/x86/math-emu/fpu_trig.c
+++ b/arch/x86/math-emu/fpu_trig.c
@@ -433,13 +433,13 @@ static void fxtract(FPU_REG *st0_ptr, u_char st0_tag)
#endif /* PARANOID */
}
-static void fdecstp(void)
+static void fdecstp(FPU_REG *st0_ptr, u_char st0_tag)
{
clear_C1();
top--;
}
-static void fincstp(void)
+static void fincstp(FPU_REG *st0_ptr, u_char st0_tag)
{
clear_C1();
top++;
@@ -1631,7 +1631,7 @@ static void fscale(FPU_REG *st0_ptr, u_char st0_tag)
static FUNC_ST0 const trig_table_a[] = {
f2xm1, fyl2x, fptan, fpatan,
- fxtract, fprem1, (FUNC_ST0) fdecstp, (FUNC_ST0) fincstp
+ fxtract, fprem1, fdecstp, fincstp,
};
void FPU_triga(void)
diff --git a/arch/x86/math-emu/reg_constant.c b/arch/x86/math-emu/reg_constant.c
index 742619e94bdf..003a0b2753e6 100644
--- a/arch/x86/math-emu/reg_constant.c
+++ b/arch/x86/math-emu/reg_constant.c
@@ -108,8 +108,13 @@ static void fldz(int rc)
typedef void (*FUNC_RC) (int);
+static void FPU_RC_illegal(int unused)
+{
+ FPU_illegal();
+}
+
static FUNC_RC constants_table[] = {
- fld1, fldl2t, fldl2e, fldpi, fldlg2, fldln2, fldz, (FUNC_RC) FPU_illegal
+ fld1, fldl2t, fldl2e, fldpi, fldlg2, fldln2, fldz, FPU_RC_illegal
};
void fconst(void)
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 428048e73bd2..690fbf48e853 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -34,7 +34,7 @@ obj-y += pat/
CFLAGS_physaddr.o := -fno-stack-protector
CFLAGS_mem_encrypt_identity.o := -fno-stack-protector
-CFLAGS_fault.o := -I $(srctree)/$(src)/../include/asm/trace
+CFLAGS_fault.o := -I $(src)/../include/asm/trace
obj-$(CONFIG_X86_32) += pgtable_32.o iomap_32.o
@@ -57,7 +57,6 @@ obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o
obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o
obj-$(CONFIG_AMD_NUMA) += amdtopology.o
obj-$(CONFIG_ACPI_NUMA) += srat.o
-obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
diff --git a/arch/x86/mm/amdtopology.c b/arch/x86/mm/amdtopology.c
index 9332b36a1091..628833afee37 100644
--- a/arch/x86/mm/amdtopology.c
+++ b/arch/x86/mm/amdtopology.c
@@ -12,6 +12,7 @@
#include <linux/string.h>
#include <linux/nodemask.h>
#include <linux/memblock.h>
+#include <linux/numa_memblks.h>
#include <asm/io.h>
#include <linux/pci_ids.h>
diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c
index e91500a80963..575f863f3c75 100644
--- a/arch/x86/mm/cpu_entry_area.c
+++ b/arch/x86/mm/cpu_entry_area.c
@@ -164,7 +164,7 @@ static void __init percpu_setup_exception_stacks(unsigned int cpu)
}
}
#else
-static inline void percpu_setup_exception_stacks(unsigned int cpu)
+static void __init percpu_setup_exception_stacks(unsigned int cpu)
{
struct cpu_entry_area *cea = get_cpu_entry_area(cpu);
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index b522933bfa56..51986e8a9d35 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -164,13 +164,6 @@ static bool ex_handler_uaccess(const struct exception_table_entry *fixup,
return ex_handler_default(fixup, regs);
}
-static bool ex_handler_copy(const struct exception_table_entry *fixup,
- struct pt_regs *regs, int trapnr)
-{
- WARN_ONCE(trapnr == X86_TRAP_GP, "General protection fault in user access. Non-canonical address?");
- return ex_handler_fault(fixup, regs, trapnr);
-}
-
static bool ex_handler_msr(const struct exception_table_entry *fixup,
struct pt_regs *regs, bool wrmsr, bool safe, int reg)
{
@@ -341,8 +334,6 @@ int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code,
return ex_handler_fault(e, regs, trapnr);
case EX_TYPE_UACCESS:
return ex_handler_uaccess(e, regs, trapnr, fault_addr);
- case EX_TYPE_COPY:
- return ex_handler_copy(e, regs, trapnr);
case EX_TYPE_CLEAR_FS:
return ex_handler_clear_fs(e, regs);
case EX_TYPE_FPU_RESTORE:
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index bba4e020dd64..e6c469b323cc 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -20,6 +20,7 @@
#include <linux/efi.h> /* efi_crash_gracefully_on_page_fault()*/
#include <linux/mm_types.h>
#include <linux/mm.h> /* find_and_lock_vma() */
+#include <linux/vmalloc.h>
#include <asm/cpufeature.h> /* boot_cpu_has, ... */
#include <asm/traps.h> /* dotraplinkage, ... */
@@ -514,18 +515,19 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long ad
if (error_code & X86_PF_INSTR) {
unsigned int level;
+ bool nx, rw;
pgd_t *pgd;
pte_t *pte;
pgd = __va(read_cr3_pa());
pgd += pgd_index(address);
- pte = lookup_address_in_pgd(pgd, address, &level);
+ pte = lookup_address_in_pgd_attr(pgd, address, &level, &nx, &rw);
- if (pte && pte_present(*pte) && !pte_exec(*pte))
+ if (pte && pte_present(*pte) && (!pte_exec(*pte) || nx))
pr_crit("kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n",
from_kuid(&init_user_ns, current_uid()));
- if (pte && pte_present(*pte) && pte_exec(*pte) &&
+ if (pte && pte_present(*pte) && pte_exec(*pte) && !nx &&
(pgd_flags(*pgd) & _PAGE_USER) &&
(__read_cr4() & X86_CR4_SMEP))
pr_crit("unable to execute userspace code (SMEP?) (uid: %d)\n",
@@ -834,14 +836,17 @@ bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
static void
__bad_area(struct pt_regs *regs, unsigned long error_code,
- unsigned long address, u32 pkey, int si_code)
+ unsigned long address, struct mm_struct *mm,
+ struct vm_area_struct *vma, u32 pkey, int si_code)
{
- struct mm_struct *mm = current->mm;
/*
* Something tried to access memory that isn't in our memory map..
* Fix it, but check if it's kernel or user first..
*/
- mmap_read_unlock(mm);
+ if (mm)
+ mmap_read_unlock(mm);
+ else
+ vma_end_read(vma);
__bad_area_nosemaphore(regs, error_code, address, pkey, si_code);
}
@@ -865,7 +870,8 @@ static inline bool bad_area_access_from_pkeys(unsigned long error_code,
static noinline void
bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
- unsigned long address, struct vm_area_struct *vma)
+ unsigned long address, struct mm_struct *mm,
+ struct vm_area_struct *vma)
{
/*
* This OSPKE check is not strictly necessary at runtime.
@@ -895,9 +901,9 @@ bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
*/
u32 pkey = vma_pkey(vma);
- __bad_area(regs, error_code, address, pkey, SEGV_PKUERR);
+ __bad_area(regs, error_code, address, mm, vma, pkey, SEGV_PKUERR);
} else {
- __bad_area(regs, error_code, address, 0, SEGV_ACCERR);
+ __bad_area(regs, error_code, address, mm, vma, 0, SEGV_ACCERR);
}
}
@@ -1325,8 +1331,9 @@ void do_user_addr_fault(struct pt_regs *regs,
goto lock_mmap;
if (unlikely(access_error(error_code, vma))) {
- vma_end_read(vma);
- goto lock_mmap;
+ bad_area_access_error(regs, error_code, address, NULL, vma);
+ count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
+ return;
}
fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
@@ -1362,7 +1369,7 @@ retry:
* we can handle it..
*/
if (unlikely(access_error(error_code, vma))) {
- bad_area_access_error(regs, error_code, address, vma);
+ bad_area_access_error(regs, error_code, address, mm, vma);
return;
}
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 5804bbae4f01..807a5859a3c4 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -19,41 +19,14 @@
#include <asm/tlbflush.h>
#include <asm/elf.h>
-/*
- * pmd_huge() returns 1 if @pmd is hugetlb related entry, that is normal
- * hugetlb entry or non-present (migration or hwpoisoned) hugetlb entry.
- * Otherwise, returns 0.
- */
-int pmd_huge(pmd_t pmd)
-{
- return !pmd_none(pmd) &&
- (pmd_val(pmd) & (_PAGE_PRESENT|_PAGE_PSE)) != _PAGE_PRESENT;
-}
-
-/*
- * pud_huge() returns 1 if @pud is hugetlb related entry, that is normal
- * hugetlb entry or non-present (migration or hwpoisoned) hugetlb entry.
- * Otherwise, returns 0.
- */
-int pud_huge(pud_t pud)
-{
-#if CONFIG_PGTABLE_LEVELS > 2
- return !pud_none(pud) &&
- (pud_val(pud) & (_PAGE_PRESENT|_PAGE_PSE)) != _PAGE_PRESENT;
-#else
- return 0;
-#endif
-}
-
#ifdef CONFIG_HUGETLB_PAGE
static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
unsigned long addr, unsigned long len,
unsigned long pgoff, unsigned long flags)
{
struct hstate *h = hstate_file(file);
- struct vm_unmapped_area_info info;
+ struct vm_unmapped_area_info info = {};
- info.flags = 0;
info.length = len;
info.low_limit = get_mmap_base(1);
@@ -65,7 +38,6 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
task_size_32bit() : task_size_64bit(addr > DEFAULT_MAP_WINDOW);
info.align_mask = PAGE_MASK & ~huge_page_mask(h);
- info.align_offset = 0;
return vm_unmapped_area(&info);
}
@@ -74,7 +46,7 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
unsigned long pgoff, unsigned long flags)
{
struct hstate *h = hstate_file(file);
- struct vm_unmapped_area_info info;
+ struct vm_unmapped_area_info info = {};
info.flags = VM_UNMAPPED_AREA_TOPDOWN;
info.length = len;
@@ -89,7 +61,6 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW;
info.align_mask = PAGE_MASK & ~huge_page_mask(h);
- info.align_offset = 0;
addr = vm_unmapped_area(&info);
/*
@@ -141,7 +112,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
}
get_unmapped_area:
- if (mm->get_unmapped_area == arch_get_unmapped_area)
+ if (!test_bit(MMF_TOPDOWN, &mm->flags))
return hugetlb_get_unmapped_area_bottomup(file, addr, len,
pgoff, flags);
else
diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c
index 968d7005f4a7..437e96fb4977 100644
--- a/arch/x86/mm/ident_map.c
+++ b/arch/x86/mm/ident_map.c
@@ -4,6 +4,79 @@
* included by both the compressed kernel and the regular kernel.
*/
+static void free_pte(struct x86_mapping_info *info, pmd_t *pmd)
+{
+ pte_t *pte = pte_offset_kernel(pmd, 0);
+
+ info->free_pgt_page(pte, info->context);
+}
+
+static void free_pmd(struct x86_mapping_info *info, pud_t *pud)
+{
+ pmd_t *pmd = pmd_offset(pud, 0);
+ int i;
+
+ for (i = 0; i < PTRS_PER_PMD; i++) {
+ if (!pmd_present(pmd[i]))
+ continue;
+
+ if (pmd_leaf(pmd[i]))
+ continue;
+
+ free_pte(info, &pmd[i]);
+ }
+
+ info->free_pgt_page(pmd, info->context);
+}
+
+static void free_pud(struct x86_mapping_info *info, p4d_t *p4d)
+{
+ pud_t *pud = pud_offset(p4d, 0);
+ int i;
+
+ for (i = 0; i < PTRS_PER_PUD; i++) {
+ if (!pud_present(pud[i]))
+ continue;
+
+ if (pud_leaf(pud[i]))
+ continue;
+
+ free_pmd(info, &pud[i]);
+ }
+
+ info->free_pgt_page(pud, info->context);
+}
+
+static void free_p4d(struct x86_mapping_info *info, pgd_t *pgd)
+{
+ p4d_t *p4d = p4d_offset(pgd, 0);
+ int i;
+
+ for (i = 0; i < PTRS_PER_P4D; i++) {
+ if (!p4d_present(p4d[i]))
+ continue;
+
+ free_pud(info, &p4d[i]);
+ }
+
+ if (pgtable_l5_enabled())
+ info->free_pgt_page(p4d, info->context);
+}
+
+void kernel_ident_mapping_free(struct x86_mapping_info *info, pgd_t *pgd)
+{
+ int i;
+
+ for (i = 0; i < PTRS_PER_PGD; i++) {
+ if (!pgd_present(pgd[i]))
+ continue;
+
+ free_p4d(info, &pgd[i]);
+ }
+
+ info->free_pgt_page(pgd, info->context);
+}
+
static void ident_pmd_init(struct x86_mapping_info *info, pmd_t *pmd_page,
unsigned long addr, unsigned long end)
{
@@ -26,18 +99,31 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page,
for (; addr < end; addr = next) {
pud_t *pud = pud_page + pud_index(addr);
pmd_t *pmd;
+ bool use_gbpage;
next = (addr & PUD_MASK) + PUD_SIZE;
if (next > end)
next = end;
- if (info->direct_gbpages) {
- pud_t pudval;
+ /* if this is already a gbpage, this portion is already mapped */
+ if (pud_leaf(*pud))
+ continue;
+
+ /* Is using a gbpage allowed? */
+ use_gbpage = info->direct_gbpages;
+
+ /* Don't use gbpage if it maps more than the requested region. */
+ /* at the begining: */
+ use_gbpage &= ((addr & ~PUD_MASK) == 0);
+ /* ... or at the end: */
+ use_gbpage &= ((next & ~PUD_MASK) == 0);
- if (pud_present(*pud))
- continue;
+ /* Never overwrite existing mappings */
+ use_gbpage &= !pud_present(*pud);
+
+ if (use_gbpage) {
+ pud_t pudval;
- addr &= PUD_MASK;
pudval = __pud((addr - info->offset) | info->page_flag);
set_pud(pud, pudval);
continue;
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 679893ea5e68..eb503f53c319 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -7,6 +7,7 @@
#include <linux/swapops.h>
#include <linux/kmemleak.h>
#include <linux/sched/task.h>
+#include <linux/execmem.h>
#include <asm/set_memory.h>
#include <asm/cpu_device_id.h>
@@ -261,21 +262,17 @@ static void __init probe_page_size_mask(void)
}
}
-#define INTEL_MATCH(_model) { .vendor = X86_VENDOR_INTEL, \
- .family = 6, \
- .model = _model, \
- }
/*
* INVLPG may not properly flush Global entries
* on these CPUs when PCIDs are enabled.
*/
static const struct x86_cpu_id invlpg_miss_ids[] = {
- INTEL_MATCH(INTEL_FAM6_ALDERLAKE ),
- INTEL_MATCH(INTEL_FAM6_ALDERLAKE_L ),
- INTEL_MATCH(INTEL_FAM6_ATOM_GRACEMONT ),
- INTEL_MATCH(INTEL_FAM6_RAPTORLAKE ),
- INTEL_MATCH(INTEL_FAM6_RAPTORLAKE_P),
- INTEL_MATCH(INTEL_FAM6_RAPTORLAKE_S),
+ X86_MATCH_VFM(INTEL_ALDERLAKE, 0),
+ X86_MATCH_VFM(INTEL_ALDERLAKE_L, 0),
+ X86_MATCH_VFM(INTEL_ATOM_GRACEMONT, 0),
+ X86_MATCH_VFM(INTEL_RAPTORLAKE, 0),
+ X86_MATCH_VFM(INTEL_RAPTORLAKE_P, 0),
+ X86_MATCH_VFM(INTEL_RAPTORLAKE_S, 0),
{}
};
@@ -990,53 +987,6 @@ void __init free_initrd_mem(unsigned long start, unsigned long end)
}
#endif
-/*
- * Calculate the precise size of the DMA zone (first 16 MB of RAM),
- * and pass it to the MM layer - to help it set zone watermarks more
- * accurately.
- *
- * Done on 64-bit systems only for the time being, although 32-bit systems
- * might benefit from this as well.
- */
-void __init memblock_find_dma_reserve(void)
-{
-#ifdef CONFIG_X86_64
- u64 nr_pages = 0, nr_free_pages = 0;
- unsigned long start_pfn, end_pfn;
- phys_addr_t start_addr, end_addr;
- int i;
- u64 u;
-
- /*
- * Iterate over all memory ranges (free and reserved ones alike),
- * to calculate the total number of pages in the first 16 MB of RAM:
- */
- nr_pages = 0;
- for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
- start_pfn = min(start_pfn, MAX_DMA_PFN);
- end_pfn = min(end_pfn, MAX_DMA_PFN);
-
- nr_pages += end_pfn - start_pfn;
- }
-
- /*
- * Iterate over free memory ranges to calculate the number of free
- * pages in the DMA zone, while not counting potential partial
- * pages at the beginning or the end of the range:
- */
- nr_free_pages = 0;
- for_each_free_mem_range(u, NUMA_NO_NODE, MEMBLOCK_NONE, &start_addr, &end_addr, NULL) {
- start_pfn = min_t(unsigned long, PFN_UP(start_addr), MAX_DMA_PFN);
- end_pfn = min_t(unsigned long, PFN_DOWN(end_addr), MAX_DMA_PFN);
-
- if (start_pfn < end_pfn)
- nr_free_pages += end_pfn - start_pfn;
- }
-
- set_dma_reserve(nr_pages - nr_free_pages);
-#endif
-}
-
void __init zone_sizes_init(void)
{
unsigned long max_zone_pfns[MAX_NR_ZONES];
@@ -1099,3 +1049,31 @@ unsigned long arch_max_swapfile_size(void)
return pages;
}
#endif
+
+#ifdef CONFIG_EXECMEM
+static struct execmem_info execmem_info __ro_after_init;
+
+struct execmem_info __init *execmem_arch_setup(void)
+{
+ unsigned long start, offset = 0;
+
+ if (kaslr_enabled())
+ offset = get_random_u32_inclusive(1, 1024) * PAGE_SIZE;
+
+ start = MODULES_VADDR + offset;
+
+ execmem_info = (struct execmem_info){
+ .ranges = {
+ [EXECMEM_DEFAULT] = {
+ .flags = EXECMEM_KASAN_SHADOW,
+ .start = start,
+ .end = MODULES_END,
+ .pgprot = PAGE_KERNEL,
+ .alignment = MODULE_ALIGN,
+ },
+ },
+ };
+
+ return &execmem_info;
+}
+#endif /* CONFIG_EXECMEM */
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 7e177856ee4f..ff253648706f 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -469,7 +469,9 @@ phys_pte_init(pte_t *pte_page, unsigned long paddr, unsigned long paddr_end,
!e820__mapped_any(paddr & PAGE_MASK, paddr_next,
E820_TYPE_RAM) &&
!e820__mapped_any(paddr & PAGE_MASK, paddr_next,
- E820_TYPE_RESERVED_KERN))
+ E820_TYPE_RESERVED_KERN) &&
+ !e820__mapped_any(paddr & PAGE_MASK, paddr_next,
+ E820_TYPE_ACPI))
set_pte_init(pte, __pte(0), init);
continue;
}
@@ -524,7 +526,9 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end,
!e820__mapped_any(paddr & PMD_MASK, paddr_next,
E820_TYPE_RAM) &&
!e820__mapped_any(paddr & PMD_MASK, paddr_next,
- E820_TYPE_RESERVED_KERN))
+ E820_TYPE_RESERVED_KERN) &&
+ !e820__mapped_any(paddr & PMD_MASK, paddr_next,
+ E820_TYPE_ACPI))
set_pmd_init(pmd, __pmd(0), init);
continue;
}
@@ -611,7 +615,9 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end,
!e820__mapped_any(paddr & PUD_MASK, paddr_next,
E820_TYPE_RAM) &&
!e820__mapped_any(paddr & PUD_MASK, paddr_next,
- E820_TYPE_RESERVED_KERN))
+ E820_TYPE_RESERVED_KERN) &&
+ !e820__mapped_any(paddr & PUD_MASK, paddr_next,
+ E820_TYPE_ACPI))
set_pud_init(pud, __pud(0), init);
continue;
}
@@ -698,7 +704,9 @@ phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end,
!e820__mapped_any(paddr & P4D_MASK, paddr_next,
E820_TYPE_RAM) &&
!e820__mapped_any(paddr & P4D_MASK, paddr_next,
- E820_TYPE_RESERVED_KERN))
+ E820_TYPE_RESERVED_KERN) &&
+ !e820__mapped_any(paddr & P4D_MASK, paddr_next,
+ E820_TYPE_ACPI))
set_p4d_init(p4d, __p4d(0), init);
continue;
}
@@ -950,8 +958,12 @@ static void update_end_of_memory_vars(u64 start, u64 size)
int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages,
struct mhp_params *params)
{
+ unsigned long end = ((start_pfn + nr_pages) << PAGE_SHIFT) - 1;
int ret;
+ if (WARN_ON_ONCE(end > PHYSMEM_END))
+ return -ERANGE;
+
ret = __add_pages(nid, start_pfn, nr_pages, params);
WARN_ON_ONCE(ret);
@@ -980,8 +992,6 @@ static void __meminit free_pagetable(struct page *page, int order)
/* bootmem page has reserved flag */
if (PageReserved(page)) {
- __ClearPageReserved(page);
-
magic = page->index;
if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) {
while (nr_pages--)
@@ -1354,18 +1364,6 @@ void __init mem_init(void)
preallocate_vmalloc_pages();
}
-#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
-int __init deferred_page_init_max_threads(const struct cpumask *node_cpumask)
-{
- /*
- * More CPUs always led to greater speedups on tested systems, up to
- * all the nodes' CPUs. Use all since the system is otherwise idle
- * now.
- */
- return max_t(int, cpumask_weight(node_cpumask), 1);
-}
-#endif
-
int kernel_set_to_readonly;
void mark_rodata_ro(void)
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index aa7d279321ea..70b02fc61d93 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -11,6 +11,7 @@
#include <linux/init.h>
#include <linux/io.h>
#include <linux/ioport.h>
+#include <linux/ioremap.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/mmiotrace.h>
@@ -457,7 +458,7 @@ void iounmap(volatile void __iomem *addr)
{
struct vm_struct *p, *o;
- if ((void __force *)addr <= high_memory)
+ if (WARN_ON_ONCE(!is_ioremap_addr((void __force *)addr)))
return;
/*
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index 37db264866b6..230f1dee4f09 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -47,13 +47,24 @@ static const unsigned long vaddr_end = CPU_ENTRY_AREA_BASE;
*/
static __initdata struct kaslr_memory_region {
unsigned long *base;
+ unsigned long *end;
unsigned long size_tb;
} kaslr_regions[] = {
- { &page_offset_base, 0 },
- { &vmalloc_base, 0 },
- { &vmemmap_base, 0 },
+ {
+ .base = &page_offset_base,
+ .end = &physmem_end,
+ },
+ {
+ .base = &vmalloc_base,
+ },
+ {
+ .base = &vmemmap_base,
+ },
};
+/* The end of the possible address space for physical memory */
+unsigned long physmem_end __ro_after_init;
+
/* Get size in bytes used by the memory region */
static inline unsigned long get_padding(struct kaslr_memory_region *region)
{
@@ -82,6 +93,8 @@ void __init kernel_randomize_memory(void)
BUILD_BUG_ON(vaddr_end != CPU_ENTRY_AREA_BASE);
BUILD_BUG_ON(vaddr_end > __START_KERNEL_map);
+ /* Preset the end of the possible address space for physical memory */
+ physmem_end = ((1ULL << MAX_PHYSMEM_BITS) - 1);
if (!kaslr_memory_enabled())
return;
@@ -128,11 +141,18 @@ void __init kernel_randomize_memory(void)
vaddr += entropy;
*kaslr_regions[i].base = vaddr;
+ /* Calculate the end of the region */
+ vaddr += get_padding(&kaslr_regions[i]);
/*
- * Jump the region and add a minimum padding based on
- * randomization alignment.
+ * KASLR trims the maximum possible size of the
+ * direct-map. Update the physmem_end boundary.
+ * No rounding required as the region starts
+ * PUD aligned and size is in units of TB.
*/
- vaddr += get_padding(&kaslr_regions[i]);
+ if (kaslr_regions[i].end)
+ *kaslr_regions[i].end = __pa_nodebug(vaddr - 1);
+
+ /* Add a minimum padding based on randomization alignment. */
vaddr = round_up(vaddr + 1, PUD_SIZE);
remain_entropy -= entropy;
}
diff --git a/arch/x86/mm/mem_encrypt_amd.c b/arch/x86/mm/mem_encrypt_amd.c
index 422602f6039b..86a476a426c2 100644
--- a/arch/x86/mm/mem_encrypt_amd.c
+++ b/arch/x86/mm/mem_encrypt_amd.c
@@ -2,7 +2,7 @@
/*
* AMD Memory Encryption Support
*
- * Copyright (C) 2016 Advanced Micro Devices, Inc.
+ * Copyright (C) 2016-2024 Advanced Micro Devices, Inc.
*
* Author: Tom Lendacky <thomas.lendacky@amd.com>
*/
@@ -283,7 +283,7 @@ static void enc_dec_hypercall(unsigned long vaddr, unsigned long size, bool enc)
#endif
}
-static bool amd_enc_status_change_prepare(unsigned long vaddr, int npages, bool enc)
+static int amd_enc_status_change_prepare(unsigned long vaddr, int npages, bool enc)
{
/*
* To maintain the security guarantees of SEV-SNP guests, make sure
@@ -292,11 +292,11 @@ static bool amd_enc_status_change_prepare(unsigned long vaddr, int npages, bool
if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP) && !enc)
snp_set_memory_shared(vaddr, npages);
- return true;
+ return 0;
}
/* Return true unconditionally: return value doesn't matter for the SEV side */
-static bool amd_enc_status_change_finish(unsigned long vaddr, int npages, bool enc)
+static int amd_enc_status_change_finish(unsigned long vaddr, int npages, bool enc)
{
/*
* After memory is mapped encrypted in the page table, validate it
@@ -308,7 +308,7 @@ static bool amd_enc_status_change_finish(unsigned long vaddr, int npages, bool e
if (!cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT))
enc_dec_hypercall(vaddr, npages << PAGE_SHIFT, enc);
- return true;
+ return 0;
}
static void __init __set_clr_pte_enc(pte_t *kpte, int level, bool enc)
@@ -510,6 +510,12 @@ void __init sme_early_init(void)
*/
x86_init.resources.dmi_setup = snp_dmi_setup;
}
+
+ /*
+ * Switch the SVSM CA mapping (if active) from identity mapped to
+ * kernel mapped.
+ */
+ snp_update_svsm_ca();
}
void __init mem_encrypt_free_decrypted_mem(void)
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index c90c20904a60..a2cabb1c81e1 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -129,9 +129,9 @@ static void arch_pick_mmap_base(unsigned long *base, unsigned long *legacy_base,
void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
{
if (mmap_is_legacy())
- mm->get_unmapped_area = arch_get_unmapped_area;
+ clear_bit(MMF_TOPDOWN, &mm->flags);
else
- mm->get_unmapped_area = arch_get_unmapped_area_topdown;
+ set_bit(MMF_TOPDOWN, &mm->flags);
arch_pick_mmap_base(&mm->mmap_base, &mm->mmap_legacy_base,
arch_rnd(mmap64_rnd_bits), task_size_64bit(0),
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 65e9a6e391c0..64e5cdb2460a 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -13,6 +13,7 @@
#include <linux/sched.h>
#include <linux/topology.h>
#include <linux/sort.h>
+#include <linux/numa_memblks.h>
#include <asm/e820/api.h>
#include <asm/proto.h>
@@ -22,16 +23,6 @@
#include "numa_internal.h"
int numa_off;
-nodemask_t numa_nodes_parsed __initdata;
-
-struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
-EXPORT_SYMBOL(node_data);
-
-static struct numa_meminfo numa_meminfo __initdata_or_meminfo;
-static struct numa_meminfo numa_reserved_meminfo __initdata_or_meminfo;
-
-static int numa_distance_cnt;
-static u8 *numa_distance;
static __init int numa_setup(char *opt)
{
@@ -124,456 +115,27 @@ void __init setup_node_to_cpumask_map(void)
pr_debug("Node to cpumask map for %u nodes\n", nr_node_ids);
}
-static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
- struct numa_meminfo *mi)
-{
- /* ignore zero length blks */
- if (start == end)
- return 0;
-
- /* whine about and ignore invalid blks */
- if (start > end || nid < 0 || nid >= MAX_NUMNODES) {
- pr_warn("Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n",
- nid, start, end - 1);
- return 0;
- }
-
- if (mi->nr_blks >= NR_NODE_MEMBLKS) {
- pr_err("too many memblk ranges\n");
- return -EINVAL;
- }
-
- mi->blk[mi->nr_blks].start = start;
- mi->blk[mi->nr_blks].end = end;
- mi->blk[mi->nr_blks].nid = nid;
- mi->nr_blks++;
- return 0;
-}
-
-/**
- * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo
- * @idx: Index of memblk to remove
- * @mi: numa_meminfo to remove memblk from
- *
- * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and
- * decrementing @mi->nr_blks.
- */
-void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
-{
- mi->nr_blks--;
- memmove(&mi->blk[idx], &mi->blk[idx + 1],
- (mi->nr_blks - idx) * sizeof(mi->blk[0]));
-}
-
-/**
- * numa_move_tail_memblk - Move a numa_memblk from one numa_meminfo to another
- * @dst: numa_meminfo to append block to
- * @idx: Index of memblk to remove
- * @src: numa_meminfo to remove memblk from
- */
-static void __init numa_move_tail_memblk(struct numa_meminfo *dst, int idx,
- struct numa_meminfo *src)
-{
- dst->blk[dst->nr_blks++] = src->blk[idx];
- numa_remove_memblk_from(idx, src);
-}
-
-/**
- * numa_add_memblk - Add one numa_memblk to numa_meminfo
- * @nid: NUMA node ID of the new memblk
- * @start: Start address of the new memblk
- * @end: End address of the new memblk
- *
- * Add a new memblk to the default numa_meminfo.
- *
- * RETURNS:
- * 0 on success, -errno on failure.
- */
-int __init numa_add_memblk(int nid, u64 start, u64 end)
-{
- return numa_add_memblk_to(nid, start, end, &numa_meminfo);
-}
-
-/* Allocate NODE_DATA for a node on the local memory */
-static void __init alloc_node_data(int nid)
+static int __init numa_register_nodes(void)
{
- const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
- u64 nd_pa;
- void *nd;
- int tnid;
-
- /*
- * Allocate node data. Try node-local memory and then any node.
- * Never allocate in DMA zone.
- */
- nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
- if (!nd_pa) {
- pr_err("Cannot find %zu bytes in any node (initial node: %d)\n",
- nd_size, nid);
- return;
- }
- nd = __va(nd_pa);
-
- /* report and initialize */
- printk(KERN_INFO "NODE_DATA(%d) allocated [mem %#010Lx-%#010Lx]\n", nid,
- nd_pa, nd_pa + nd_size - 1);
- tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
- if (tnid != nid)
- printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nid, tnid);
-
- node_data[nid] = nd;
- memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
-
- node_set_online(nid);
-}
-
-/**
- * numa_cleanup_meminfo - Cleanup a numa_meminfo
- * @mi: numa_meminfo to clean up
- *
- * Sanitize @mi by merging and removing unnecessary memblks. Also check for
- * conflicts and clear unused memblks.
- *
- * RETURNS:
- * 0 on success, -errno on failure.
- */
-int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
-{
- const u64 low = 0;
- const u64 high = PFN_PHYS(max_pfn);
- int i, j, k;
-
- /* first, trim all entries */
- for (i = 0; i < mi->nr_blks; i++) {
- struct numa_memblk *bi = &mi->blk[i];
-
- /* move / save reserved memory ranges */
- if (!memblock_overlaps_region(&memblock.memory,
- bi->start, bi->end - bi->start)) {
- numa_move_tail_memblk(&numa_reserved_meminfo, i--, mi);
- continue;
- }
-
- /* make sure all non-reserved blocks are inside the limits */
- bi->start = max(bi->start, low);
-
- /* preserve info for non-RAM areas above 'max_pfn': */
- if (bi->end > high) {
- numa_add_memblk_to(bi->nid, high, bi->end,
- &numa_reserved_meminfo);
- bi->end = high;
- }
-
- /* and there's no empty block */
- if (bi->start >= bi->end)
- numa_remove_memblk_from(i--, mi);
- }
-
- /* merge neighboring / overlapping entries */
- for (i = 0; i < mi->nr_blks; i++) {
- struct numa_memblk *bi = &mi->blk[i];
-
- for (j = i + 1; j < mi->nr_blks; j++) {
- struct numa_memblk *bj = &mi->blk[j];
- u64 start, end;
-
- /*
- * See whether there are overlapping blocks. Whine
- * about but allow overlaps of the same nid. They
- * will be merged below.
- */
- if (bi->end > bj->start && bi->start < bj->end) {
- if (bi->nid != bj->nid) {
- pr_err("node %d [mem %#010Lx-%#010Lx] overlaps with node %d [mem %#010Lx-%#010Lx]\n",
- bi->nid, bi->start, bi->end - 1,
- bj->nid, bj->start, bj->end - 1);
- return -EINVAL;
- }
- pr_warn("Warning: node %d [mem %#010Lx-%#010Lx] overlaps with itself [mem %#010Lx-%#010Lx]\n",
- bi->nid, bi->start, bi->end - 1,
- bj->start, bj->end - 1);
- }
-
- /*
- * Join together blocks on the same node, holes
- * between which don't overlap with memory on other
- * nodes.
- */
- if (bi->nid != bj->nid)
- continue;
- start = min(bi->start, bj->start);
- end = max(bi->end, bj->end);
- for (k = 0; k < mi->nr_blks; k++) {
- struct numa_memblk *bk = &mi->blk[k];
-
- if (bi->nid == bk->nid)
- continue;
- if (start < bk->end && end > bk->start)
- break;
- }
- if (k < mi->nr_blks)
- continue;
- printk(KERN_INFO "NUMA: Node %d [mem %#010Lx-%#010Lx] + [mem %#010Lx-%#010Lx] -> [mem %#010Lx-%#010Lx]\n",
- bi->nid, bi->start, bi->end - 1, bj->start,
- bj->end - 1, start, end - 1);
- bi->start = start;
- bi->end = end;
- numa_remove_memblk_from(j--, mi);
- }
- }
-
- /* clear unused ones */
- for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) {
- mi->blk[i].start = mi->blk[i].end = 0;
- mi->blk[i].nid = NUMA_NO_NODE;
- }
-
- return 0;
-}
-
-/*
- * Set nodes, which have memory in @mi, in *@nodemask.
- */
-static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask,
- const struct numa_meminfo *mi)
-{
- int i;
-
- for (i = 0; i < ARRAY_SIZE(mi->blk); i++)
- if (mi->blk[i].start != mi->blk[i].end &&
- mi->blk[i].nid != NUMA_NO_NODE)
- node_set(mi->blk[i].nid, *nodemask);
-}
-
-/**
- * numa_reset_distance - Reset NUMA distance table
- *
- * The current table is freed. The next numa_set_distance() call will
- * create a new one.
- */
-void __init numa_reset_distance(void)
-{
- size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]);
-
- /* numa_distance could be 1LU marking allocation failure, test cnt */
- if (numa_distance_cnt)
- memblock_free(numa_distance, size);
- numa_distance_cnt = 0;
- numa_distance = NULL; /* enable table creation */
-}
-
-static int __init numa_alloc_distance(void)
-{
- nodemask_t nodes_parsed;
- size_t size;
- int i, j, cnt = 0;
- u64 phys;
-
- /* size the new table and allocate it */
- nodes_parsed = numa_nodes_parsed;
- numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo);
-
- for_each_node_mask(i, nodes_parsed)
- cnt = i;
- cnt++;
- size = cnt * cnt * sizeof(numa_distance[0]);
-
- phys = memblock_phys_alloc_range(size, PAGE_SIZE, 0,
- PFN_PHYS(max_pfn_mapped));
- if (!phys) {
- pr_warn("Warning: can't allocate distance table!\n");
- /* don't retry until explicitly reset */
- numa_distance = (void *)1LU;
- return -ENOMEM;
- }
-
- numa_distance = __va(phys);
- numa_distance_cnt = cnt;
-
- /* fill with the default distances */
- for (i = 0; i < cnt; i++)
- for (j = 0; j < cnt; j++)
- numa_distance[i * cnt + j] = i == j ?
- LOCAL_DISTANCE : REMOTE_DISTANCE;
- printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt);
-
- return 0;
-}
-
-/**
- * numa_set_distance - Set NUMA distance from one NUMA to another
- * @from: the 'from' node to set distance
- * @to: the 'to' node to set distance
- * @distance: NUMA distance
- *
- * Set the distance from node @from to @to to @distance. If distance table
- * doesn't exist, one which is large enough to accommodate all the currently
- * known nodes will be created.
- *
- * If such table cannot be allocated, a warning is printed and further
- * calls are ignored until the distance table is reset with
- * numa_reset_distance().
- *
- * If @from or @to is higher than the highest known node or lower than zero
- * at the time of table creation or @distance doesn't make sense, the call
- * is ignored.
- * This is to allow simplification of specific NUMA config implementations.
- */
-void __init numa_set_distance(int from, int to, int distance)
-{
- if (!numa_distance && numa_alloc_distance() < 0)
- return;
-
- if (from >= numa_distance_cnt || to >= numa_distance_cnt ||
- from < 0 || to < 0) {
- pr_warn_once("Warning: node ids are out of bound, from=%d to=%d distance=%d\n",
- from, to, distance);
- return;
- }
-
- if ((u8)distance != distance ||
- (from == to && distance != LOCAL_DISTANCE)) {
- pr_warn_once("Warning: invalid distance parameter, from=%d to=%d distance=%d\n",
- from, to, distance);
- return;
- }
-
- numa_distance[from * numa_distance_cnt + to] = distance;
-}
-
-int __node_distance(int from, int to)
-{
- if (from >= numa_distance_cnt || to >= numa_distance_cnt)
- return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE;
- return numa_distance[from * numa_distance_cnt + to];
-}
-EXPORT_SYMBOL(__node_distance);
-
-/*
- * Mark all currently memblock-reserved physical memory (which covers the
- * kernel's own memory ranges) as hot-unswappable.
- */
-static void __init numa_clear_kernel_node_hotplug(void)
-{
- nodemask_t reserved_nodemask = NODE_MASK_NONE;
- struct memblock_region *mb_region;
- int i;
-
- /*
- * We have to do some preprocessing of memblock regions, to
- * make them suitable for reservation.
- *
- * At this time, all memory regions reserved by memblock are
- * used by the kernel, but those regions are not split up
- * along node boundaries yet, and don't necessarily have their
- * node ID set yet either.
- *
- * So iterate over all memory known to the x86 architecture,
- * and use those ranges to set the nid in memblock.reserved.
- * This will split up the memblock regions along node
- * boundaries and will set the node IDs as well.
- */
- for (i = 0; i < numa_meminfo.nr_blks; i++) {
- struct numa_memblk *mb = numa_meminfo.blk + i;
- int ret;
-
- ret = memblock_set_node(mb->start, mb->end - mb->start, &memblock.reserved, mb->nid);
- WARN_ON_ONCE(ret);
- }
-
- /*
- * Now go over all reserved memblock regions, to construct a
- * node mask of all kernel reserved memory areas.
- *
- * [ Note, when booting with mem=nn[kMG] or in a kdump kernel,
- * numa_meminfo might not include all memblock.reserved
- * memory ranges, because quirks such as trim_snb_memory()
- * reserve specific pages for Sandy Bridge graphics. ]
- */
- for_each_reserved_mem_region(mb_region) {
- int nid = memblock_get_region_node(mb_region);
-
- if (nid != MAX_NUMNODES)
- node_set(nid, reserved_nodemask);
- }
-
- /*
- * Finally, clear the MEMBLOCK_HOTPLUG flag for all memory
- * belonging to the reserved node mask.
- *
- * Note that this will include memory regions that reside
- * on nodes that contain kernel memory - entire nodes
- * become hot-unpluggable:
- */
- for (i = 0; i < numa_meminfo.nr_blks; i++) {
- struct numa_memblk *mb = numa_meminfo.blk + i;
-
- if (!node_isset(mb->nid, reserved_nodemask))
- continue;
-
- memblock_clear_hotplug(mb->start, mb->end - mb->start);
- }
-}
-
-static int __init numa_register_memblks(struct numa_meminfo *mi)
-{
- int i, nid;
-
- /* Account for nodes with cpus and no memory */
- node_possible_map = numa_nodes_parsed;
- numa_nodemask_from_meminfo(&node_possible_map, mi);
- if (WARN_ON(nodes_empty(node_possible_map)))
- return -EINVAL;
-
- for (i = 0; i < mi->nr_blks; i++) {
- struct numa_memblk *mb = &mi->blk[i];
- memblock_set_node(mb->start, mb->end - mb->start,
- &memblock.memory, mb->nid);
- }
-
- /*
- * At very early time, the kernel have to use some memory such as
- * loading the kernel image. We cannot prevent this anyway. So any
- * node the kernel resides in should be un-hotpluggable.
- *
- * And when we come here, alloc node data won't fail.
- */
- numa_clear_kernel_node_hotplug();
-
- /*
- * If sections array is gonna be used for pfn -> nid mapping, check
- * whether its granularity is fine enough.
- */
- if (IS_ENABLED(NODE_NOT_IN_PAGE_FLAGS)) {
- unsigned long pfn_align = node_map_pfn_alignment();
-
- if (pfn_align && pfn_align < PAGES_PER_SECTION) {
- pr_warn("Node alignment %LuMB < min %LuMB, rejecting NUMA config\n",
- PFN_PHYS(pfn_align) >> 20,
- PFN_PHYS(PAGES_PER_SECTION) >> 20);
- return -EINVAL;
- }
- }
+ int nid;
if (!memblock_validate_numa_coverage(SZ_1M))
return -EINVAL;
/* Finally register nodes. */
for_each_node_mask(nid, node_possible_map) {
- u64 start = PFN_PHYS(max_pfn);
- u64 end = 0;
-
- for (i = 0; i < mi->nr_blks; i++) {
- if (nid != mi->blk[i].nid)
- continue;
- start = min(mi->blk[i].start, start);
- end = max(mi->blk[i].end, end);
- }
+ unsigned long start_pfn, end_pfn;
- if (start >= end)
+ /*
+ * Note, get_pfn_range_for_nid() depends on
+ * memblock_set_node() having already happened
+ */
+ get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
+ if (start_pfn >= end_pfn)
continue;
alloc_node_data(nid);
+ node_set_online(nid);
}
/* Dump memblock with node info and return. */
@@ -609,39 +171,11 @@ static int __init numa_init(int (*init_func)(void))
for (i = 0; i < MAX_LOCAL_APIC; i++)
set_apicid_to_node(i, NUMA_NO_NODE);
- nodes_clear(numa_nodes_parsed);
- nodes_clear(node_possible_map);
- nodes_clear(node_online_map);
- memset(&numa_meminfo, 0, sizeof(numa_meminfo));
- WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.memory,
- MAX_NUMNODES));
- WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.reserved,
- MAX_NUMNODES));
- /* In case that parsing SRAT failed. */
- WARN_ON(memblock_clear_hotplug(0, ULLONG_MAX));
- numa_reset_distance();
-
- ret = init_func();
- if (ret < 0)
- return ret;
-
- /*
- * We reset memblock back to the top-down direction
- * here because if we configured ACPI_NUMA, we have
- * parsed SRAT in init_func(). It is ok to have the
- * reset here even if we did't configure ACPI_NUMA
- * or acpi numa init fails and fallbacks to dummy
- * numa init.
- */
- memblock_set_bottom_up(false);
-
- ret = numa_cleanup_meminfo(&numa_meminfo);
+ ret = numa_memblks_init(init_func, /* memblock_force_top_down */ true);
if (ret < 0)
return ret;
- numa_emulation(&numa_meminfo, numa_distance_cnt);
-
- ret = numa_register_memblks(&numa_meminfo);
+ ret = numa_register_nodes();
if (ret < 0)
return ret;
@@ -782,12 +316,12 @@ void __init init_cpu_to_node(void)
#ifndef CONFIG_DEBUG_PER_CPU_MAPS
# ifndef CONFIG_NUMA_EMU
-void numa_add_cpu(int cpu)
+void numa_add_cpu(unsigned int cpu)
{
cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
}
-void numa_remove_cpu(int cpu)
+void numa_remove_cpu(unsigned int cpu)
{
cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
}
@@ -825,7 +359,7 @@ int early_cpu_to_node(int cpu)
return per_cpu(x86_cpu_to_node_map, cpu);
}
-void debug_cpumask_set_cpu(int cpu, int node, bool enable)
+void debug_cpumask_set_cpu(unsigned int cpu, int node, bool enable)
{
struct cpumask *mask;
@@ -857,12 +391,12 @@ static void numa_set_cpumask(int cpu, bool enable)
debug_cpumask_set_cpu(cpu, early_cpu_to_node(cpu), enable);
}
-void numa_add_cpu(int cpu)
+void numa_add_cpu(unsigned int cpu)
{
numa_set_cpumask(cpu, true);
}
-void numa_remove_cpu(int cpu)
+void numa_remove_cpu(unsigned int cpu)
{
numa_set_cpumask(cpu, false);
}
@@ -893,113 +427,29 @@ EXPORT_SYMBOL(cpumask_of_node);
#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
-#ifdef CONFIG_NUMA_KEEP_MEMINFO
-static int meminfo_to_nid(struct numa_meminfo *mi, u64 start)
+#ifdef CONFIG_NUMA_EMU
+void __init numa_emu_update_cpu_to_node(int *emu_nid_to_phys,
+ unsigned int nr_emu_nids)
{
- int i;
-
- for (i = 0; i < mi->nr_blks; i++)
- if (mi->blk[i].start <= start && mi->blk[i].end > start)
- return mi->blk[i].nid;
- return NUMA_NO_NODE;
-}
-
-int phys_to_target_node(phys_addr_t start)
-{
- int nid = meminfo_to_nid(&numa_meminfo, start);
+ int i, j;
/*
- * Prefer online nodes, but if reserved memory might be
- * hot-added continue the search with reserved ranges.
+ * Transform __apicid_to_node table to use emulated nids by
+ * reverse-mapping phys_nid. The maps should always exist but fall
+ * back to zero just in case.
*/
- if (nid != NUMA_NO_NODE)
- return nid;
-
- return meminfo_to_nid(&numa_reserved_meminfo, start);
-}
-EXPORT_SYMBOL_GPL(phys_to_target_node);
-
-int memory_add_physaddr_to_nid(u64 start)
-{
- int nid = meminfo_to_nid(&numa_meminfo, start);
-
- if (nid == NUMA_NO_NODE)
- nid = numa_meminfo.blk[0].nid;
- return nid;
-}
-EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
-
-static int __init cmp_memblk(const void *a, const void *b)
-{
- const struct numa_memblk *ma = *(const struct numa_memblk **)a;
- const struct numa_memblk *mb = *(const struct numa_memblk **)b;
-
- return (ma->start > mb->start) - (ma->start < mb->start);
+ for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) {
+ if (__apicid_to_node[i] == NUMA_NO_NODE)
+ continue;
+ for (j = 0; j < nr_emu_nids; j++)
+ if (__apicid_to_node[i] == emu_nid_to_phys[j])
+ break;
+ __apicid_to_node[i] = j < nr_emu_nids ? j : 0;
+ }
}
-static struct numa_memblk *numa_memblk_list[NR_NODE_MEMBLKS] __initdata;
-
-/**
- * numa_fill_memblks - Fill gaps in numa_meminfo memblks
- * @start: address to begin fill
- * @end: address to end fill
- *
- * Find and extend numa_meminfo memblks to cover the physical
- * address range @start-@end
- *
- * RETURNS:
- * 0 : Success
- * NUMA_NO_MEMBLK : No memblks exist in address range @start-@end
- */
-
-int __init numa_fill_memblks(u64 start, u64 end)
+u64 __init numa_emu_dma_end(void)
{
- struct numa_memblk **blk = &numa_memblk_list[0];
- struct numa_meminfo *mi = &numa_meminfo;
- int count = 0;
- u64 prev_end;
-
- /*
- * Create a list of pointers to numa_meminfo memblks that
- * overlap start, end. The list is used to make in-place
- * changes that fill out the numa_meminfo memblks.
- */
- for (int i = 0; i < mi->nr_blks; i++) {
- struct numa_memblk *bi = &mi->blk[i];
-
- if (memblock_addrs_overlap(start, end - start, bi->start,
- bi->end - bi->start)) {
- blk[count] = &mi->blk[i];
- count++;
- }
- }
- if (!count)
- return NUMA_NO_MEMBLK;
-
- /* Sort the list of pointers in memblk->start order */
- sort(&blk[0], count, sizeof(blk[0]), cmp_memblk, NULL);
-
- /* Make sure the first/last memblks include start/end */
- blk[0]->start = min(blk[0]->start, start);
- blk[count - 1]->end = max(blk[count - 1]->end, end);
-
- /*
- * Fill any gaps by tracking the previous memblks
- * end address and backfilling to it if needed.
- */
- prev_end = blk[0]->end;
- for (int i = 1; i < count; i++) {
- struct numa_memblk *curr = blk[i];
-
- if (prev_end >= curr->start) {
- if (prev_end < curr->end)
- prev_end = curr->end;
- } else {
- curr->start = prev_end;
- prev_end = curr->end;
- }
- }
- return 0;
+ return PFN_PHYS(MAX_DMA32_PFN);
}
-
-#endif
+#endif /* CONFIG_NUMA_EMU */
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index 025fd7ea5d69..65fda406e6f2 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -24,6 +24,7 @@
#include <linux/memblock.h>
#include <linux/init.h>
+#include <linux/vmalloc.h>
#include <asm/pgtable_areas.h>
#include "numa_internal.h"
diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
deleted file mode 100644
index 9a9305367fdd..000000000000
--- a/arch/x86/mm/numa_emulation.c
+++ /dev/null
@@ -1,585 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * NUMA emulation
- */
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/topology.h>
-#include <linux/memblock.h>
-#include <asm/dma.h>
-
-#include "numa_internal.h"
-
-static int emu_nid_to_phys[MAX_NUMNODES];
-static char *emu_cmdline __initdata;
-
-int __init numa_emu_cmdline(char *str)
-{
- emu_cmdline = str;
- return 0;
-}
-
-static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi)
-{
- int i;
-
- for (i = 0; i < mi->nr_blks; i++)
- if (mi->blk[i].nid == nid)
- return i;
- return -ENOENT;
-}
-
-static u64 __init mem_hole_size(u64 start, u64 end)
-{
- unsigned long start_pfn = PFN_UP(start);
- unsigned long end_pfn = PFN_DOWN(end);
-
- if (start_pfn < end_pfn)
- return PFN_PHYS(absent_pages_in_range(start_pfn, end_pfn));
- return 0;
-}
-
-/*
- * Sets up nid to range from @start to @end. The return value is -errno if
- * something went wrong, 0 otherwise.
- */
-static int __init emu_setup_memblk(struct numa_meminfo *ei,
- struct numa_meminfo *pi,
- int nid, int phys_blk, u64 size)
-{
- struct numa_memblk *eb = &ei->blk[ei->nr_blks];
- struct numa_memblk *pb = &pi->blk[phys_blk];
-
- if (ei->nr_blks >= NR_NODE_MEMBLKS) {
- pr_err("NUMA: Too many emulated memblks, failing emulation\n");
- return -EINVAL;
- }
-
- ei->nr_blks++;
- eb->start = pb->start;
- eb->end = pb->start + size;
- eb->nid = nid;
-
- if (emu_nid_to_phys[nid] == NUMA_NO_NODE)
- emu_nid_to_phys[nid] = pb->nid;
-
- pb->start += size;
- if (pb->start >= pb->end) {
- WARN_ON_ONCE(pb->start > pb->end);
- numa_remove_memblk_from(phys_blk, pi);
- }
-
- printk(KERN_INFO "Faking node %d at [mem %#018Lx-%#018Lx] (%LuMB)\n",
- nid, eb->start, eb->end - 1, (eb->end - eb->start) >> 20);
- return 0;
-}
-
-/*
- * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
- * to max_addr.
- *
- * Returns zero on success or negative on error.
- */
-static int __init split_nodes_interleave(struct numa_meminfo *ei,
- struct numa_meminfo *pi,
- u64 addr, u64 max_addr, int nr_nodes)
-{
- nodemask_t physnode_mask = numa_nodes_parsed;
- u64 size;
- int big;
- int nid = 0;
- int i, ret;
-
- if (nr_nodes <= 0)
- return -1;
- if (nr_nodes > MAX_NUMNODES) {
- pr_info("numa=fake=%d too large, reducing to %d\n",
- nr_nodes, MAX_NUMNODES);
- nr_nodes = MAX_NUMNODES;
- }
-
- /*
- * Calculate target node size. x86_32 freaks on __udivdi3() so do
- * the division in ulong number of pages and convert back.
- */
- size = max_addr - addr - mem_hole_size(addr, max_addr);
- size = PFN_PHYS((unsigned long)(size >> PAGE_SHIFT) / nr_nodes);
-
- /*
- * Calculate the number of big nodes that can be allocated as a result
- * of consolidating the remainder.
- */
- big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
- FAKE_NODE_MIN_SIZE;
-
- size &= FAKE_NODE_MIN_HASH_MASK;
- if (!size) {
- pr_err("Not enough memory for each node. "
- "NUMA emulation disabled.\n");
- return -1;
- }
-
- /*
- * Continue to fill physical nodes with fake nodes until there is no
- * memory left on any of them.
- */
- while (!nodes_empty(physnode_mask)) {
- for_each_node_mask(i, physnode_mask) {
- u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
- u64 start, limit, end;
- int phys_blk;
-
- phys_blk = emu_find_memblk_by_nid(i, pi);
- if (phys_blk < 0) {
- node_clear(i, physnode_mask);
- continue;
- }
- start = pi->blk[phys_blk].start;
- limit = pi->blk[phys_blk].end;
- end = start + size;
-
- if (nid < big)
- end += FAKE_NODE_MIN_SIZE;
-
- /*
- * Continue to add memory to this fake node if its
- * non-reserved memory is less than the per-node size.
- */
- while (end - start - mem_hole_size(start, end) < size) {
- end += FAKE_NODE_MIN_SIZE;
- if (end > limit) {
- end = limit;
- break;
- }
- }
-
- /*
- * If there won't be at least FAKE_NODE_MIN_SIZE of
- * non-reserved memory in ZONE_DMA32 for the next node,
- * this one must extend to the boundary.
- */
- if (end < dma32_end && dma32_end - end -
- mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
- end = dma32_end;
-
- /*
- * If there won't be enough non-reserved memory for the
- * next node, this one must extend to the end of the
- * physical node.
- */
- if (limit - end - mem_hole_size(end, limit) < size)
- end = limit;
-
- ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes,
- phys_blk,
- min(end, limit) - start);
- if (ret < 0)
- return ret;
- }
- }
- return 0;
-}
-
-/*
- * Returns the end address of a node so that there is at least `size' amount of
- * non-reserved memory or `max_addr' is reached.
- */
-static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
-{
- u64 end = start + size;
-
- while (end - start - mem_hole_size(start, end) < size) {
- end += FAKE_NODE_MIN_SIZE;
- if (end > max_addr) {
- end = max_addr;
- break;
- }
- }
- return end;
-}
-
-static u64 uniform_size(u64 max_addr, u64 base, u64 hole, int nr_nodes)
-{
- unsigned long max_pfn = PHYS_PFN(max_addr);
- unsigned long base_pfn = PHYS_PFN(base);
- unsigned long hole_pfns = PHYS_PFN(hole);
-
- return PFN_PHYS((max_pfn - base_pfn - hole_pfns) / nr_nodes);
-}
-
-/*
- * Sets up fake nodes of `size' interleaved over physical nodes ranging from
- * `addr' to `max_addr'.
- *
- * Returns zero on success or negative on error.
- */
-static int __init split_nodes_size_interleave_uniform(struct numa_meminfo *ei,
- struct numa_meminfo *pi,
- u64 addr, u64 max_addr, u64 size,
- int nr_nodes, struct numa_memblk *pblk,
- int nid)
-{
- nodemask_t physnode_mask = numa_nodes_parsed;
- int i, ret, uniform = 0;
- u64 min_size;
-
- if ((!size && !nr_nodes) || (nr_nodes && !pblk))
- return -1;
-
- /*
- * In the 'uniform' case split the passed in physical node by
- * nr_nodes, in the non-uniform case, ignore the passed in
- * physical block and try to create nodes of at least size
- * @size.
- *
- * In the uniform case, split the nodes strictly by physical
- * capacity, i.e. ignore holes. In the non-uniform case account
- * for holes and treat @size as a minimum floor.
- */
- if (!nr_nodes)
- nr_nodes = MAX_NUMNODES;
- else {
- nodes_clear(physnode_mask);
- node_set(pblk->nid, physnode_mask);
- uniform = 1;
- }
-
- if (uniform) {
- min_size = uniform_size(max_addr, addr, 0, nr_nodes);
- size = min_size;
- } else {
- /*
- * The limit on emulated nodes is MAX_NUMNODES, so the
- * size per node is increased accordingly if the
- * requested size is too small. This creates a uniform
- * distribution of node sizes across the entire machine
- * (but not necessarily over physical nodes).
- */
- min_size = uniform_size(max_addr, addr,
- mem_hole_size(addr, max_addr), nr_nodes);
- }
- min_size = ALIGN(max(min_size, FAKE_NODE_MIN_SIZE), FAKE_NODE_MIN_SIZE);
- if (size < min_size) {
- pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
- size >> 20, min_size >> 20);
- size = min_size;
- }
- size = ALIGN_DOWN(size, FAKE_NODE_MIN_SIZE);
-
- /*
- * Fill physical nodes with fake nodes of size until there is no memory
- * left on any of them.
- */
- while (!nodes_empty(physnode_mask)) {
- for_each_node_mask(i, physnode_mask) {
- u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
- u64 start, limit, end;
- int phys_blk;
-
- phys_blk = emu_find_memblk_by_nid(i, pi);
- if (phys_blk < 0) {
- node_clear(i, physnode_mask);
- continue;
- }
-
- start = pi->blk[phys_blk].start;
- limit = pi->blk[phys_blk].end;
-
- if (uniform)
- end = start + size;
- else
- end = find_end_of_node(start, limit, size);
- /*
- * If there won't be at least FAKE_NODE_MIN_SIZE of
- * non-reserved memory in ZONE_DMA32 for the next node,
- * this one must extend to the boundary.
- */
- if (end < dma32_end && dma32_end - end -
- mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
- end = dma32_end;
-
- /*
- * If there won't be enough non-reserved memory for the
- * next node, this one must extend to the end of the
- * physical node.
- */
- if ((limit - end - mem_hole_size(end, limit) < size)
- && !uniform)
- end = limit;
-
- ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES,
- phys_blk,
- min(end, limit) - start);
- if (ret < 0)
- return ret;
- }
- }
- return nid;
-}
-
-static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
- struct numa_meminfo *pi,
- u64 addr, u64 max_addr, u64 size)
-{
- return split_nodes_size_interleave_uniform(ei, pi, addr, max_addr, size,
- 0, NULL, 0);
-}
-
-static int __init setup_emu2phys_nid(int *dfl_phys_nid)
-{
- int i, max_emu_nid = 0;
-
- *dfl_phys_nid = NUMA_NO_NODE;
- for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) {
- if (emu_nid_to_phys[i] != NUMA_NO_NODE) {
- max_emu_nid = i;
- if (*dfl_phys_nid == NUMA_NO_NODE)
- *dfl_phys_nid = emu_nid_to_phys[i];
- }
- }
-
- return max_emu_nid;
-}
-
-/**
- * numa_emulation - Emulate NUMA nodes
- * @numa_meminfo: NUMA configuration to massage
- * @numa_dist_cnt: The size of the physical NUMA distance table
- *
- * Emulate NUMA nodes according to the numa=fake kernel parameter.
- * @numa_meminfo contains the physical memory configuration and is modified
- * to reflect the emulated configuration on success. @numa_dist_cnt is
- * used to determine the size of the physical distance table.
- *
- * On success, the following modifications are made.
- *
- * - @numa_meminfo is updated to reflect the emulated nodes.
- *
- * - __apicid_to_node[] is updated such that APIC IDs are mapped to the
- * emulated nodes.
- *
- * - NUMA distance table is rebuilt to represent distances between emulated
- * nodes. The distances are determined considering how emulated nodes
- * are mapped to physical nodes and match the actual distances.
- *
- * - emu_nid_to_phys[] reflects how emulated nodes are mapped to physical
- * nodes. This is used by numa_add_cpu() and numa_remove_cpu().
- *
- * If emulation is not enabled or fails, emu_nid_to_phys[] is filled with
- * identity mapping and no other modification is made.
- */
-void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
-{
- static struct numa_meminfo ei __initdata;
- static struct numa_meminfo pi __initdata;
- const u64 max_addr = PFN_PHYS(max_pfn);
- u8 *phys_dist = NULL;
- size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]);
- int max_emu_nid, dfl_phys_nid;
- int i, j, ret;
-
- if (!emu_cmdline)
- goto no_emu;
-
- memset(&ei, 0, sizeof(ei));
- pi = *numa_meminfo;
-
- for (i = 0; i < MAX_NUMNODES; i++)
- emu_nid_to_phys[i] = NUMA_NO_NODE;
-
- /*
- * If the numa=fake command-line contains a 'M' or 'G', it represents
- * the fixed node size. Otherwise, if it is just a single number N,
- * split the system RAM into N fake nodes.
- */
- if (strchr(emu_cmdline, 'U')) {
- nodemask_t physnode_mask = numa_nodes_parsed;
- unsigned long n;
- int nid = 0;
-
- n = simple_strtoul(emu_cmdline, &emu_cmdline, 0);
- ret = -1;
- for_each_node_mask(i, physnode_mask) {
- /*
- * The reason we pass in blk[0] is due to
- * numa_remove_memblk_from() called by
- * emu_setup_memblk() will delete entry 0
- * and then move everything else up in the pi.blk
- * array. Therefore we should always be looking
- * at blk[0].
- */
- ret = split_nodes_size_interleave_uniform(&ei, &pi,
- pi.blk[0].start, pi.blk[0].end, 0,
- n, &pi.blk[0], nid);
- if (ret < 0)
- break;
- if (ret < n) {
- pr_info("%s: phys: %d only got %d of %ld nodes, failing\n",
- __func__, i, ret, n);
- ret = -1;
- break;
- }
- nid = ret;
- }
- } else if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) {
- u64 size;
-
- size = memparse(emu_cmdline, &emu_cmdline);
- ret = split_nodes_size_interleave(&ei, &pi, 0, max_addr, size);
- } else {
- unsigned long n;
-
- n = simple_strtoul(emu_cmdline, &emu_cmdline, 0);
- ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n);
- }
- if (*emu_cmdline == ':')
- emu_cmdline++;
-
- if (ret < 0)
- goto no_emu;
-
- if (numa_cleanup_meminfo(&ei) < 0) {
- pr_warn("NUMA: Warning: constructed meminfo invalid, disabling emulation\n");
- goto no_emu;
- }
-
- /* copy the physical distance table */
- if (numa_dist_cnt) {
- u64 phys;
-
- phys = memblock_phys_alloc_range(phys_size, PAGE_SIZE, 0,
- PFN_PHYS(max_pfn_mapped));
- if (!phys) {
- pr_warn("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n");
- goto no_emu;
- }
- phys_dist = __va(phys);
-
- for (i = 0; i < numa_dist_cnt; i++)
- for (j = 0; j < numa_dist_cnt; j++)
- phys_dist[i * numa_dist_cnt + j] =
- node_distance(i, j);
- }
-
- /*
- * Determine the max emulated nid and the default phys nid to use
- * for unmapped nodes.
- */
- max_emu_nid = setup_emu2phys_nid(&dfl_phys_nid);
-
- /* commit */
- *numa_meminfo = ei;
-
- /* Make sure numa_nodes_parsed only contains emulated nodes */
- nodes_clear(numa_nodes_parsed);
- for (i = 0; i < ARRAY_SIZE(ei.blk); i++)
- if (ei.blk[i].start != ei.blk[i].end &&
- ei.blk[i].nid != NUMA_NO_NODE)
- node_set(ei.blk[i].nid, numa_nodes_parsed);
-
- /*
- * Transform __apicid_to_node table to use emulated nids by
- * reverse-mapping phys_nid. The maps should always exist but fall
- * back to zero just in case.
- */
- for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) {
- if (__apicid_to_node[i] == NUMA_NO_NODE)
- continue;
- for (j = 0; j < ARRAY_SIZE(emu_nid_to_phys); j++)
- if (__apicid_to_node[i] == emu_nid_to_phys[j])
- break;
- __apicid_to_node[i] = j < ARRAY_SIZE(emu_nid_to_phys) ? j : 0;
- }
-
- /* make sure all emulated nodes are mapped to a physical node */
- for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
- if (emu_nid_to_phys[i] == NUMA_NO_NODE)
- emu_nid_to_phys[i] = dfl_phys_nid;
-
- /* transform distance table */
- numa_reset_distance();
- for (i = 0; i < max_emu_nid + 1; i++) {
- for (j = 0; j < max_emu_nid + 1; j++) {
- int physi = emu_nid_to_phys[i];
- int physj = emu_nid_to_phys[j];
- int dist;
-
- if (get_option(&emu_cmdline, &dist) == 2)
- ;
- else if (physi >= numa_dist_cnt || physj >= numa_dist_cnt)
- dist = physi == physj ?
- LOCAL_DISTANCE : REMOTE_DISTANCE;
- else
- dist = phys_dist[physi * numa_dist_cnt + physj];
-
- numa_set_distance(i, j, dist);
- }
- }
-
- /* free the copied physical distance table */
- memblock_free(phys_dist, phys_size);
- return;
-
-no_emu:
- /* No emulation. Build identity emu_nid_to_phys[] for numa_add_cpu() */
- for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
- emu_nid_to_phys[i] = i;
-}
-
-#ifndef CONFIG_DEBUG_PER_CPU_MAPS
-void numa_add_cpu(int cpu)
-{
- int physnid, nid;
-
- nid = early_cpu_to_node(cpu);
- BUG_ON(nid == NUMA_NO_NODE || !node_online(nid));
-
- physnid = emu_nid_to_phys[nid];
-
- /*
- * Map the cpu to each emulated node that is allocated on the physical
- * node of the cpu's apic id.
- */
- for_each_online_node(nid)
- if (emu_nid_to_phys[nid] == physnid)
- cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
-}
-
-void numa_remove_cpu(int cpu)
-{
- int i;
-
- for_each_online_node(i)
- cpumask_clear_cpu(cpu, node_to_cpumask_map[i]);
-}
-#else /* !CONFIG_DEBUG_PER_CPU_MAPS */
-static void numa_set_cpumask(int cpu, bool enable)
-{
- int nid, physnid;
-
- nid = early_cpu_to_node(cpu);
- if (nid == NUMA_NO_NODE) {
- /* early_cpu_to_node() already emits a warning and trace */
- return;
- }
-
- physnid = emu_nid_to_phys[nid];
-
- for_each_online_node(nid) {
- if (emu_nid_to_phys[nid] != physnid)
- continue;
-
- debug_cpumask_set_cpu(cpu, nid, enable);
- }
-}
-
-void numa_add_cpu(int cpu)
-{
- numa_set_cpumask(cpu, true);
-}
-
-void numa_remove_cpu(int cpu)
-{
- numa_set_cpumask(cpu, false);
-}
-#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
diff --git a/arch/x86/mm/numa_internal.h b/arch/x86/mm/numa_internal.h
index 86860f279662..11e1ff370c10 100644
--- a/arch/x86/mm/numa_internal.h
+++ b/arch/x86/mm/numa_internal.h
@@ -5,30 +5,6 @@
#include <linux/types.h>
#include <asm/numa.h>
-struct numa_memblk {
- u64 start;
- u64 end;
- int nid;
-};
-
-struct numa_meminfo {
- int nr_blks;
- struct numa_memblk blk[NR_NODE_MEMBLKS];
-};
-
-void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi);
-int __init numa_cleanup_meminfo(struct numa_meminfo *mi);
-void __init numa_reset_distance(void);
-
void __init x86_numa_init(void);
-#ifdef CONFIG_NUMA_EMU
-void __init numa_emulation(struct numa_meminfo *numa_meminfo,
- int numa_dist_cnt);
-#else
-static inline void numa_emulation(struct numa_meminfo *numa_meminfo,
- int numa_dist_cnt)
-{ }
-#endif
-
#endif /* __X86_MM_NUMA_INTERNAL_H */
diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c
index 36b603d0cdde..feb8cc6a12bf 100644
--- a/arch/x86/mm/pat/memtype.c
+++ b/arch/x86/mm/pat/memtype.c
@@ -39,6 +39,7 @@
#include <linux/pfn_t.h>
#include <linux/slab.h>
#include <linux/mm.h>
+#include <linux/highmem.h>
#include <linux/fs.h>
#include <linux/rbtree.h>
@@ -103,7 +104,7 @@ __setup("debugpat", pat_debug_setup);
#ifdef CONFIG_X86_PAT
/*
- * X86 PAT uses page flags arch_1 and uncached together to keep track of
+ * X86 PAT uses page flags arch_1 and arch_2 together to keep track of
* memory type of pages that have backing page struct.
*
* X86 PAT supports 4 different memory types:
@@ -117,9 +118,9 @@ __setup("debugpat", pat_debug_setup);
#define _PGMT_WB 0
#define _PGMT_WC (1UL << PG_arch_1)
-#define _PGMT_UC_MINUS (1UL << PG_uncached)
-#define _PGMT_WT (1UL << PG_uncached | 1UL << PG_arch_1)
-#define _PGMT_MASK (1UL << PG_uncached | 1UL << PG_arch_1)
+#define _PGMT_UC_MINUS (1UL << PG_arch_2)
+#define _PGMT_WT (1UL << PG_arch_2 | 1UL << PG_arch_1)
+#define _PGMT_MASK (1UL << PG_arch_2 | 1UL << PG_arch_1)
#define _PGMT_CLEAR_MASK (~_PGMT_MASK)
static inline enum page_cache_mode get_page_memtype(struct page *pg)
@@ -175,15 +176,6 @@ static inline void set_page_memtype(struct page *pg,
}
#endif
-enum {
- PAT_UC = 0, /* uncached */
- PAT_WC = 1, /* Write combining */
- PAT_WT = 4, /* Write Through */
- PAT_WP = 5, /* Write Protected */
- PAT_WB = 6, /* Write Back (default) */
- PAT_UC_MINUS = 7, /* UC, but can be overridden by MTRR */
-};
-
#define CM(c) (_PAGE_CACHE_MODE_ ## c)
static enum page_cache_mode __init pat_get_cache_mode(unsigned int pat_val,
@@ -193,13 +185,13 @@ static enum page_cache_mode __init pat_get_cache_mode(unsigned int pat_val,
char *cache_mode;
switch (pat_val) {
- case PAT_UC: cache = CM(UC); cache_mode = "UC "; break;
- case PAT_WC: cache = CM(WC); cache_mode = "WC "; break;
- case PAT_WT: cache = CM(WT); cache_mode = "WT "; break;
- case PAT_WP: cache = CM(WP); cache_mode = "WP "; break;
- case PAT_WB: cache = CM(WB); cache_mode = "WB "; break;
- case PAT_UC_MINUS: cache = CM(UC_MINUS); cache_mode = "UC- "; break;
- default: cache = CM(WB); cache_mode = "WB "; break;
+ case X86_MEMTYPE_UC: cache = CM(UC); cache_mode = "UC "; break;
+ case X86_MEMTYPE_WC: cache = CM(WC); cache_mode = "WC "; break;
+ case X86_MEMTYPE_WT: cache = CM(WT); cache_mode = "WT "; break;
+ case X86_MEMTYPE_WP: cache = CM(WP); cache_mode = "WP "; break;
+ case X86_MEMTYPE_WB: cache = CM(WB); cache_mode = "WB "; break;
+ case X86_MEMTYPE_UC_MINUS: cache = CM(UC_MINUS); cache_mode = "UC- "; break;
+ default: cache = CM(WB); cache_mode = "WB "; break;
}
memcpy(msg, cache_mode, 4);
@@ -256,12 +248,6 @@ void pat_cpu_init(void)
void __init pat_bp_init(void)
{
struct cpuinfo_x86 *c = &boot_cpu_data;
-#define PAT(p0, p1, p2, p3, p4, p5, p6, p7) \
- (((u64)PAT_ ## p0) | ((u64)PAT_ ## p1 << 8) | \
- ((u64)PAT_ ## p2 << 16) | ((u64)PAT_ ## p3 << 24) | \
- ((u64)PAT_ ## p4 << 32) | ((u64)PAT_ ## p5 << 40) | \
- ((u64)PAT_ ## p6 << 48) | ((u64)PAT_ ## p7 << 56))
-
if (!IS_ENABLED(CONFIG_X86_PAT))
pr_info_once("x86/PAT: PAT support disabled because CONFIG_X86_PAT is disabled in the kernel.\n");
@@ -292,7 +278,7 @@ void __init pat_bp_init(void)
* NOTE: When WC or WP is used, it is redirected to UC- per
* the default setup in __cachemode2pte_tbl[].
*/
- pat_msr_val = PAT(WB, WT, UC_MINUS, UC, WB, WT, UC_MINUS, UC);
+ pat_msr_val = PAT_VALUE(WB, WT, UC_MINUS, UC, WB, WT, UC_MINUS, UC);
}
/*
@@ -327,7 +313,7 @@ void __init pat_bp_init(void)
* NOTE: When WT or WP is used, it is redirected to UC- per
* the default setup in __cachemode2pte_tbl[].
*/
- pat_msr_val = PAT(WB, WC, UC_MINUS, UC, WB, WC, UC_MINUS, UC);
+ pat_msr_val = PAT_VALUE(WB, WC, UC_MINUS, UC, WB, WC, UC_MINUS, UC);
} else {
/*
* Full PAT support. We put WT in slot 7 to improve
@@ -355,13 +341,12 @@ void __init pat_bp_init(void)
* The reserved slots are unused, but mapped to their
* corresponding types in the presence of PAT errata.
*/
- pat_msr_val = PAT(WB, WC, UC_MINUS, UC, WB, WP, UC_MINUS, WT);
+ pat_msr_val = PAT_VALUE(WB, WC, UC_MINUS, UC, WB, WP, UC_MINUS, WT);
}
memory_caching_control |= CACHE_PAT;
init_cache_modes(pat_msr_val);
-#undef PAT
}
static DEFINE_SPINLOCK(memtype_lock); /* protects memtype accesses */
@@ -947,6 +932,26 @@ static void free_pfn_range(u64 paddr, unsigned long size)
memtype_free(paddr, paddr + size);
}
+static int follow_phys(struct vm_area_struct *vma, unsigned long *prot,
+ resource_size_t *phys)
+{
+ struct follow_pfnmap_args args = { .vma = vma, .address = vma->vm_start };
+
+ if (follow_pfnmap_start(&args))
+ return -EINVAL;
+
+ /* Never return PFNs of anon folios in COW mappings. */
+ if (!args.special) {
+ follow_pfnmap_end(&args);
+ return -EINVAL;
+ }
+
+ *prot = pgprot_val(args.pgprot);
+ *phys = (resource_size_t)args.pfn << PAGE_SHIFT;
+ follow_pfnmap_end(&args);
+ return 0;
+}
+
static int get_pat_info(struct vm_area_struct *vma, resource_size_t *paddr,
pgprot_t *pgprot)
{
@@ -964,7 +969,7 @@ static int get_pat_info(struct vm_area_struct *vma, resource_size_t *paddr,
* detect the PFN. If we need the cachemode as well, we're out of luck
* for now and have to fail fork().
*/
- if (!follow_phys(vma, vma->vm_start, 0, &prot, paddr)) {
+ if (!follow_phys(vma, &prot, paddr)) {
if (pgprot)
*pgprot = __pgprot(prot);
return 0;
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index 80c9037ffadf..44f7b2ea6a07 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -619,7 +619,8 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long start,
* Validate strict W^X semantics.
*/
static inline pgprot_t verify_rwx(pgprot_t old, pgprot_t new, unsigned long start,
- unsigned long pfn, unsigned long npg)
+ unsigned long pfn, unsigned long npg,
+ bool nx, bool rw)
{
unsigned long end;
@@ -641,6 +642,10 @@ static inline pgprot_t verify_rwx(pgprot_t old, pgprot_t new, unsigned long star
if ((pgprot_val(new) & (_PAGE_RW | _PAGE_NX)) != _PAGE_RW)
return new;
+ /* Non-leaf translation entries can disable writing or execution. */
+ if (!rw || nx)
+ return new;
+
end = start + npg * PAGE_SIZE - 1;
WARN_ONCE(1, "CPA detected W^X violation: %016llx -> %016llx range: 0x%016lx - 0x%016lx PFN %lx\n",
(unsigned long long)pgprot_val(old),
@@ -657,56 +662,82 @@ static inline pgprot_t verify_rwx(pgprot_t old, pgprot_t new, unsigned long star
/*
* Lookup the page table entry for a virtual address in a specific pgd.
- * Return a pointer to the entry and the level of the mapping.
+ * Return a pointer to the entry (or NULL if the entry does not exist),
+ * the level of the entry, and the effective NX and RW bits of all
+ * page table levels.
*/
-pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
- unsigned int *level)
+pte_t *lookup_address_in_pgd_attr(pgd_t *pgd, unsigned long address,
+ unsigned int *level, bool *nx, bool *rw)
{
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
- *level = PG_LEVEL_NONE;
+ *level = PG_LEVEL_256T;
+ *nx = false;
+ *rw = true;
if (pgd_none(*pgd))
return NULL;
+ *level = PG_LEVEL_512G;
+ *nx |= pgd_flags(*pgd) & _PAGE_NX;
+ *rw &= pgd_flags(*pgd) & _PAGE_RW;
+
p4d = p4d_offset(pgd, address);
if (p4d_none(*p4d))
return NULL;
- *level = PG_LEVEL_512G;
if (p4d_leaf(*p4d) || !p4d_present(*p4d))
return (pte_t *)p4d;
+ *level = PG_LEVEL_1G;
+ *nx |= p4d_flags(*p4d) & _PAGE_NX;
+ *rw &= p4d_flags(*p4d) & _PAGE_RW;
+
pud = pud_offset(p4d, address);
if (pud_none(*pud))
return NULL;
- *level = PG_LEVEL_1G;
if (pud_leaf(*pud) || !pud_present(*pud))
return (pte_t *)pud;
+ *level = PG_LEVEL_2M;
+ *nx |= pud_flags(*pud) & _PAGE_NX;
+ *rw &= pud_flags(*pud) & _PAGE_RW;
+
pmd = pmd_offset(pud, address);
if (pmd_none(*pmd))
return NULL;
- *level = PG_LEVEL_2M;
if (pmd_leaf(*pmd) || !pmd_present(*pmd))
return (pte_t *)pmd;
*level = PG_LEVEL_4K;
+ *nx |= pmd_flags(*pmd) & _PAGE_NX;
+ *rw &= pmd_flags(*pmd) & _PAGE_RW;
return pte_offset_kernel(pmd, address);
}
/*
+ * Lookup the page table entry for a virtual address in a specific pgd.
+ * Return a pointer to the entry and the level of the mapping.
+ */
+pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
+ unsigned int *level)
+{
+ bool nx, rw;
+
+ return lookup_address_in_pgd_attr(pgd, address, level, &nx, &rw);
+}
+
+/*
* Lookup the page table entry for a virtual address. Return a pointer
* to the entry and the level of the mapping.
*
- * Note: We return pud and pmd either when the entry is marked large
- * or when the present bit is not set. Otherwise we would return a
- * pointer to a nonexisting mapping.
+ * Note: the function returns p4d, pud or pmd either when the entry is marked
+ * large or when the present bit is not set. Otherwise it returns NULL.
*/
pte_t *lookup_address(unsigned long address, unsigned int *level)
{
@@ -715,13 +746,16 @@ pte_t *lookup_address(unsigned long address, unsigned int *level)
EXPORT_SYMBOL_GPL(lookup_address);
static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address,
- unsigned int *level)
+ unsigned int *level, bool *nx, bool *rw)
{
- if (cpa->pgd)
- return lookup_address_in_pgd(cpa->pgd + pgd_index(address),
- address, level);
+ pgd_t *pgd;
+
+ if (!cpa->pgd)
+ pgd = pgd_offset_k(address);
+ else
+ pgd = cpa->pgd + pgd_index(address);
- return lookup_address(address, level);
+ return lookup_address_in_pgd_attr(pgd, address, level, nx, rw);
}
/*
@@ -849,12 +883,13 @@ static int __should_split_large_page(pte_t *kpte, unsigned long address,
pgprot_t old_prot, new_prot, req_prot, chk_prot;
pte_t new_pte, *tmp;
enum pg_level level;
+ bool nx, rw;
/*
* Check for races, another CPU might have split this page
* up already:
*/
- tmp = _lookup_address_cpa(cpa, address, &level);
+ tmp = _lookup_address_cpa(cpa, address, &level, &nx, &rw);
if (tmp != kpte)
return 1;
@@ -965,7 +1000,8 @@ static int __should_split_large_page(pte_t *kpte, unsigned long address,
new_prot = static_protections(req_prot, lpaddr, old_pfn, numpages,
psize, CPA_DETECT);
- new_prot = verify_rwx(old_prot, new_prot, lpaddr, old_pfn, numpages);
+ new_prot = verify_rwx(old_prot, new_prot, lpaddr, old_pfn, numpages,
+ nx, rw);
/*
* If there is a conflict, split the large page.
@@ -1046,6 +1082,7 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
pte_t *pbase = (pte_t *)page_address(base);
unsigned int i, level;
pgprot_t ref_prot;
+ bool nx, rw;
pte_t *tmp;
spin_lock(&pgd_lock);
@@ -1053,7 +1090,7 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
* Check for races, another CPU might have split this page
* up for us already:
*/
- tmp = _lookup_address_cpa(cpa, address, &level);
+ tmp = _lookup_address_cpa(cpa, address, &level, &nx, &rw);
if (tmp != kpte) {
spin_unlock(&pgd_lock);
return 1;
@@ -1082,8 +1119,8 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
lpinc = PMD_SIZE;
/*
* Clear the PSE flags if the PRESENT flag is not set
- * otherwise pmd_present/pmd_huge will return true
- * even on a non present pmd.
+ * otherwise pmd_present() will return true even on a non
+ * present pmd.
*/
if (!(pgprot_val(ref_prot) & _PAGE_PRESENT))
pgprot_val(ref_prot) &= ~_PAGE_PSE;
@@ -1594,10 +1631,11 @@ static int __change_page_attr(struct cpa_data *cpa, int primary)
int do_split, err;
unsigned int level;
pte_t *kpte, old_pte;
+ bool nx, rw;
address = __cpa_addr(cpa, cpa->curpage);
repeat:
- kpte = _lookup_address_cpa(cpa, address, &level);
+ kpte = _lookup_address_cpa(cpa, address, &level, &nx, &rw);
if (!kpte)
return __cpa_process_fault(cpa, address, primary);
@@ -1619,7 +1657,8 @@ repeat:
new_prot = static_protections(new_prot, address, pfn, 1, 0,
CPA_PROTECT);
- new_prot = verify_rwx(old_prot, new_prot, address, pfn, 1);
+ new_prot = verify_rwx(old_prot, new_prot, address, pfn, 1,
+ nx, rw);
new_prot = pgprot_clear_protnone_bits(new_prot);
@@ -2156,7 +2195,8 @@ static int __set_memory_enc_pgtable(unsigned long addr, int numpages, bool enc)
cpa_flush(&cpa, x86_platform.guest.enc_cache_flush_required());
/* Notify hypervisor that we are about to set/clr encryption attribute. */
- if (!x86_platform.guest.enc_status_change_prepare(addr, numpages, enc))
+ ret = x86_platform.guest.enc_status_change_prepare(addr, numpages, enc);
+ if (ret)
goto vmm_fail;
ret = __change_page_attr_set_clr(&cpa, 1);
@@ -2174,24 +2214,61 @@ static int __set_memory_enc_pgtable(unsigned long addr, int numpages, bool enc)
return ret;
/* Notify hypervisor that we have successfully set/clr encryption attribute. */
- if (!x86_platform.guest.enc_status_change_finish(addr, numpages, enc))
+ ret = x86_platform.guest.enc_status_change_finish(addr, numpages, enc);
+ if (ret)
goto vmm_fail;
return 0;
vmm_fail:
- WARN_ONCE(1, "CPA VMM failure to convert memory (addr=%p, numpages=%d) to %s.\n",
- (void *)addr, numpages, enc ? "private" : "shared");
+ WARN_ONCE(1, "CPA VMM failure to convert memory (addr=%p, numpages=%d) to %s: %d\n",
+ (void *)addr, numpages, enc ? "private" : "shared", ret);
+
+ return ret;
+}
+
+/*
+ * The lock serializes conversions between private and shared memory.
+ *
+ * It is taken for read on conversion. A write lock guarantees that no
+ * concurrent conversions are in progress.
+ */
+static DECLARE_RWSEM(mem_enc_lock);
+
+/*
+ * Stop new private<->shared conversions.
+ *
+ * Taking the exclusive mem_enc_lock waits for in-flight conversions to complete.
+ * The lock is not released to prevent new conversions from being started.
+ */
+bool set_memory_enc_stop_conversion(void)
+{
+ /*
+ * In a crash scenario, sleep is not allowed. Try to take the lock.
+ * Failure indicates that there is a race with the conversion.
+ */
+ if (oops_in_progress)
+ return down_write_trylock(&mem_enc_lock);
- return -EIO;
+ down_write(&mem_enc_lock);
+
+ return true;
}
static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
{
- if (cc_platform_has(CC_ATTR_MEM_ENCRYPT))
- return __set_memory_enc_pgtable(addr, numpages, enc);
+ int ret = 0;
- return 0;
+ if (cc_platform_has(CC_ATTR_MEM_ENCRYPT)) {
+ if (!down_read_trylock(&mem_enc_lock))
+ return -EBUSY;
+
+ ret = __set_memory_enc_pgtable(addr, numpages, enc);
+
+ up_read(&mem_enc_lock);
+ }
+
+ return ret;
}
int set_memory_encrypted(unsigned long addr, int numpages)
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index d007591b8059..5745a354a241 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -110,7 +110,7 @@ static inline void pgd_list_del(pgd_t *pgd)
#define UNSHARED_PTRS_PER_PGD \
(SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
#define MAX_UNSHARED_PTRS_PER_PGD \
- max_t(size_t, KERNEL_PGD_BOUNDARY, PTRS_PER_PGD)
+ MAX_T(size_t, KERNEL_PGD_BOUNDARY, PTRS_PER_PGD)
static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
@@ -631,6 +631,8 @@ int pmdp_clear_flush_young(struct vm_area_struct *vma,
pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmdp)
{
+ VM_WARN_ON_ONCE(!pmd_present(*pmdp));
+
/*
* No flush is necessary. Once an invalid PTE is established, the PTE's
* access and dirty bits cannot be updated.
@@ -639,6 +641,18 @@ pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address,
}
#endif
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
+ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
+pud_t pudp_invalidate(struct vm_area_struct *vma, unsigned long address,
+ pud_t *pudp)
+{
+ VM_WARN_ON_ONCE(!pud_present(*pudp));
+ pud_t old = pudp_establish(vma, address, pudp, pud_mkinvalid(*pudp));
+ flush_pud_tlb_range(vma, address, address + HPAGE_PUD_SIZE);
+ return old;
+}
+#endif
+
/**
* reserve_top_address - reserves a hole in the top of kernel address space
* @reserve - size of hole to reserve
@@ -731,7 +745,7 @@ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
return 0;
/* Bail out if we are we on a populated non-leaf entry: */
- if (pud_present(*pud) && !pud_huge(*pud))
+ if (pud_present(*pud) && !pud_leaf(*pud))
return 0;
set_pte((pte_t *)pud, pfn_pte(
@@ -760,7 +774,7 @@ int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
}
/* Bail out if we are we on a populated non-leaf entry: */
- if (pmd_present(*pmd) && !pmd_huge(*pmd))
+ if (pmd_present(*pmd) && !pmd_leaf(*pmd))
return 0;
set_pte((pte_t *)pmd, pfn_pte(
@@ -924,3 +938,9 @@ void arch_check_zapped_pmd(struct vm_area_struct *vma, pmd_t pmd)
VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) &&
pmd_shstk(pmd));
}
+
+void arch_check_zapped_pud(struct vm_area_struct *vma, pud_t pud)
+{
+ /* See note in arch_check_zapped_pte() */
+ VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) && pud_shstk(pud));
+}
diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index 2e69abf4f852..851ec8f1363a 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -241,7 +241,7 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
*
* Returns a pointer to a PTE on success, or NULL on failure.
*/
-static pte_t *pti_user_pagetable_walk_pte(unsigned long address)
+static pte_t *pti_user_pagetable_walk_pte(unsigned long address, bool late_text)
{
gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
pmd_t *pmd;
@@ -251,10 +251,15 @@ static pte_t *pti_user_pagetable_walk_pte(unsigned long address)
if (!pmd)
return NULL;
- /* We can't do anything sensible if we hit a large mapping. */
+ /* Large PMD mapping found */
if (pmd_leaf(*pmd)) {
- WARN_ON(1);
- return NULL;
+ /* Clear the PMD if we hit a large mapping from the first round */
+ if (late_text) {
+ set_pmd(pmd, __pmd(0));
+ } else {
+ WARN_ON_ONCE(1);
+ return NULL;
+ }
}
if (pmd_none(*pmd)) {
@@ -283,7 +288,7 @@ static void __init pti_setup_vsyscall(void)
if (!pte || WARN_ON(level != PG_LEVEL_4K) || pte_none(*pte))
return;
- target_pte = pti_user_pagetable_walk_pte(VSYSCALL_ADDR);
+ target_pte = pti_user_pagetable_walk_pte(VSYSCALL_ADDR, false);
if (WARN_ON(!target_pte))
return;
@@ -301,7 +306,7 @@ enum pti_clone_level {
static void
pti_clone_pgtable(unsigned long start, unsigned long end,
- enum pti_clone_level level)
+ enum pti_clone_level level, bool late_text)
{
unsigned long addr;
@@ -374,14 +379,14 @@ pti_clone_pgtable(unsigned long start, unsigned long end,
*/
*target_pmd = *pmd;
- addr += PMD_SIZE;
+ addr = round_up(addr + 1, PMD_SIZE);
} else if (level == PTI_CLONE_PTE) {
/* Walk the page-table down to the pte level */
pte = pte_offset_kernel(pmd, addr);
if (pte_none(*pte)) {
- addr += PAGE_SIZE;
+ addr = round_up(addr + 1, PAGE_SIZE);
continue;
}
@@ -390,7 +395,7 @@ pti_clone_pgtable(unsigned long start, unsigned long end,
return;
/* Allocate PTE in the user page-table */
- target_pte = pti_user_pagetable_walk_pte(addr);
+ target_pte = pti_user_pagetable_walk_pte(addr, late_text);
if (WARN_ON(!target_pte))
return;
@@ -401,7 +406,7 @@ pti_clone_pgtable(unsigned long start, unsigned long end,
/* Clone the PTE */
*target_pte = *pte;
- addr += PAGE_SIZE;
+ addr = round_up(addr + 1, PAGE_SIZE);
} else {
BUG();
@@ -452,7 +457,7 @@ static void __init pti_clone_user_shared(void)
phys_addr_t pa = per_cpu_ptr_to_phys((void *)va);
pte_t *target_pte;
- target_pte = pti_user_pagetable_walk_pte(va);
+ target_pte = pti_user_pagetable_walk_pte(va, false);
if (WARN_ON(!target_pte))
return;
@@ -475,7 +480,7 @@ static void __init pti_clone_user_shared(void)
start = CPU_ENTRY_AREA_BASE;
end = start + (PAGE_SIZE * CPU_ENTRY_AREA_PAGES);
- pti_clone_pgtable(start, end, PTI_CLONE_PMD);
+ pti_clone_pgtable(start, end, PTI_CLONE_PMD, false);
}
#endif /* CONFIG_X86_64 */
@@ -492,11 +497,11 @@ static void __init pti_setup_espfix64(void)
/*
* Clone the populated PMDs of the entry text and force it RO.
*/
-static void pti_clone_entry_text(void)
+static void pti_clone_entry_text(bool late)
{
pti_clone_pgtable((unsigned long) __entry_text_start,
(unsigned long) __entry_text_end,
- PTI_CLONE_PMD);
+ PTI_LEVEL_KERNEL_IMAGE, late);
}
/*
@@ -571,7 +576,7 @@ static void pti_clone_kernel_text(void)
* pti_set_kernel_image_nonglobal() did to clear the
* global bit.
*/
- pti_clone_pgtable(start, end_clone, PTI_LEVEL_KERNEL_IMAGE);
+ pti_clone_pgtable(start, end_clone, PTI_LEVEL_KERNEL_IMAGE, false);
/*
* pti_clone_pgtable() will set the global bit in any PMDs
@@ -638,8 +643,15 @@ void __init pti_init(void)
/* Undo all global bits from the init pagetables in head_64.S: */
pti_set_kernel_image_nonglobal();
+
/* Replace some of the global bits just for shared entry text: */
- pti_clone_entry_text();
+ /*
+ * This is very early in boot. Device and Late initcalls can do
+ * modprobe before free_initmem() and mark_readonly(). This
+ * pti_clone_entry_text() allows those user-mode-helpers to function,
+ * but notably the text is still RW.
+ */
+ pti_clone_entry_text(false);
pti_setup_espfix64();
pti_setup_vsyscall();
}
@@ -656,10 +668,11 @@ void pti_finalize(void)
if (!boot_cpu_has(X86_FEATURE_PTI))
return;
/*
- * We need to clone everything (again) that maps parts of the
- * kernel image.
+ * This is after free_initmem() (all initcalls are done) and we've done
+ * mark_readonly(). Text is now NX which might've split some PMDs
+ * relative to the early clone.
*/
- pti_clone_entry_text();
+ pti_clone_entry_text(true);
pti_clone_kernel_text();
debug_checkwx_user();
diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c
index 9c52a95937ad..6f8e0f21c710 100644
--- a/arch/x86/mm/srat.c
+++ b/arch/x86/mm/srat.c
@@ -57,8 +57,7 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
}
set_apicid_to_node(apic_id, node);
node_set(node, numa_nodes_parsed);
- printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n",
- pxm, apic_id, node);
+ pr_debug("SRAT: PXM %u -> APIC 0x%04x -> Node %u\n", pxm, apic_id, node);
}
/* Callback for Proximity Domain -> LAPIC mapping */
@@ -98,8 +97,7 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
set_apicid_to_node(apic_id, node);
node_set(node, numa_nodes_parsed);
- printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n",
- pxm, apic_id, node);
+ pr_debug("SRAT: PXM %u -> APIC 0x%02x -> Node %u\n", pxm, apic_id, node);
}
int __init x86_acpi_numa_init(void)
diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c
index bda73cb7a044..ae295659ca14 100644
--- a/arch/x86/mm/testmmiotrace.c
+++ b/arch/x86/mm/testmmiotrace.c
@@ -144,3 +144,4 @@ static void __exit cleanup(void)
module_init(init);
module_exit(cleanup);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Test module for mmiotrace");
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 44ac64f3a047..86593d1b787d 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -11,6 +11,7 @@
#include <linux/sched/smt.h>
#include <linux/task_work.h>
#include <linux/mmu_notifier.h>
+#include <linux/mmu_context.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
@@ -85,9 +86,6 @@
*
*/
-/* There are 12 bits of space for ASIDS in CR3 */
-#define CR3_HW_ASID_BITS 12
-
/*
* When enabled, MITIGATION_PAGE_TABLE_ISOLATION consumes a single bit for
* user/kernel switches
@@ -160,7 +158,6 @@ static inline unsigned long build_cr3(pgd_t *pgd, u16 asid, unsigned long lam)
unsigned long cr3 = __sme_pa(pgd) | lam;
if (static_cpu_has(X86_FEATURE_PCID)) {
- VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
cr3 |= kern_pcid(asid);
} else {
VM_WARN_ON_ONCE(asid != 0);
@@ -503,9 +500,9 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
{
struct mm_struct *prev = this_cpu_read(cpu_tlbstate.loaded_mm);
u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
- unsigned long new_lam = mm_lam_cr3_mask(next);
bool was_lazy = this_cpu_read(cpu_tlbstate_shared.is_lazy);
unsigned cpu = smp_processor_id();
+ unsigned long new_lam;
u64 next_tlb_gen;
bool need_flush;
u16 new_asid;
@@ -619,9 +616,7 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
cpumask_clear_cpu(cpu, mm_cpumask(prev));
}
- /*
- * Start remote flushes and then read tlb_gen.
- */
+ /* Start receiving IPIs and then read tlb_gen (and LAM below) */
if (next != &init_mm)
cpumask_set_cpu(cpu, mm_cpumask(next));
next_tlb_gen = atomic64_read(&next->context.tlb_gen);
@@ -633,7 +628,7 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
barrier();
}
- set_tlbstate_lam_mode(next);
+ new_lam = mm_lam_cr3_mask(next);
if (need_flush) {
this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
@@ -652,6 +647,7 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
this_cpu_write(cpu_tlbstate.loaded_mm, next);
this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
+ cpu_tlbstate_update_lam(new_lam, mm_untag_mask(next));
if (next != prev) {
cr4_update_pce_mm(next);
@@ -698,6 +694,7 @@ void initialize_tlbstate_and_flush(void)
int i;
struct mm_struct *mm = this_cpu_read(cpu_tlbstate.loaded_mm);
u64 tlb_gen = atomic64_read(&init_mm.context.tlb_gen);
+ unsigned long lam = mm_lam_cr3_mask(mm);
unsigned long cr3 = __read_cr3();
/* Assert that CR3 already references the right mm. */
@@ -705,7 +702,7 @@ void initialize_tlbstate_and_flush(void)
/* LAM expected to be disabled */
WARN_ON(cr3 & (X86_CR3_LAM_U48 | X86_CR3_LAM_U57));
- WARN_ON(mm_lam_cr3_mask(mm));
+ WARN_ON(lam);
/*
* Assert that CR4.PCIDE is set if needed. (CR4.PCIDE initialization
@@ -724,7 +721,7 @@ void initialize_tlbstate_and_flush(void)
this_cpu_write(cpu_tlbstate.next_asid, 1);
this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id);
this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, tlb_gen);
- set_tlbstate_lam_mode(mm);
+ cpu_tlbstate_update_lam(lam, mm_untag_mask(mm));
for (i = 1; i < TLB_NR_DYN_ASIDS; i++)
this_cpu_write(cpu_tlbstate.ctxs[i].ctx_id, 0);
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 59cbc94b6e69..06b080b61aa5 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -64,6 +64,56 @@ static bool is_imm8(int value)
return value <= 127 && value >= -128;
}
+/*
+ * Let us limit the positive offset to be <= 123.
+ * This is to ensure eventual jit convergence For the following patterns:
+ * ...
+ * pass4, final_proglen=4391:
+ * ...
+ * 20e: 48 85 ff test rdi,rdi
+ * 211: 74 7d je 0x290
+ * 213: 48 8b 77 00 mov rsi,QWORD PTR [rdi+0x0]
+ * ...
+ * 289: 48 85 ff test rdi,rdi
+ * 28c: 74 17 je 0x2a5
+ * 28e: e9 7f ff ff ff jmp 0x212
+ * 293: bf 03 00 00 00 mov edi,0x3
+ * Note that insn at 0x211 is 2-byte cond jump insn for offset 0x7d (-125)
+ * and insn at 0x28e is 5-byte jmp insn with offset -129.
+ *
+ * pass5, final_proglen=4392:
+ * ...
+ * 20e: 48 85 ff test rdi,rdi
+ * 211: 0f 84 80 00 00 00 je 0x297
+ * 217: 48 8b 77 00 mov rsi,QWORD PTR [rdi+0x0]
+ * ...
+ * 28d: 48 85 ff test rdi,rdi
+ * 290: 74 1a je 0x2ac
+ * 292: eb 84 jmp 0x218
+ * 294: bf 03 00 00 00 mov edi,0x3
+ * Note that insn at 0x211 is 6-byte cond jump insn now since its offset
+ * becomes 0x80 based on previous round (0x293 - 0x213 = 0x80).
+ * At the same time, insn at 0x292 is a 2-byte insn since its offset is
+ * -124.
+ *
+ * pass6 will repeat the same code as in pass4 and this will prevent
+ * eventual convergence.
+ *
+ * To fix this issue, we need to break je (2->6 bytes) <-> jmp (5->2 bytes)
+ * cycle in the above. In the above example je offset <= 0x7c should work.
+ *
+ * For other cases, je <-> je needs offset <= 0x7b to avoid no convergence
+ * issue. For jmp <-> je and jmp <-> jmp cases, jmp offset <= 0x7c should
+ * avoid no convergence issue.
+ *
+ * Overall, let us limit the positive offset for 8bit cond/uncond jmp insn
+ * to maximum 123 (0x7b). This way, the jit pass can eventually converge.
+ */
+static bool is_imm8_jmp_offset(int value)
+{
+ return value <= 123 && value >= -128;
+}
+
static bool is_simm32(s64 value)
{
return value == (s64)(s32)value;
@@ -273,7 +323,7 @@ struct jit_context {
/* Number of bytes emit_patch() needs to generate instructions */
#define X86_PATCH_SIZE 5
/* Number of bytes that will be skipped on tailcall */
-#define X86_TAIL_CALL_OFFSET (11 + ENDBR_INSN_SIZE)
+#define X86_TAIL_CALL_OFFSET (12 + ENDBR_INSN_SIZE)
static void push_r12(u8 **pprog)
{
@@ -403,6 +453,37 @@ static void emit_cfi(u8 **pprog, u32 hash)
*pprog = prog;
}
+static void emit_prologue_tail_call(u8 **pprog, bool is_subprog)
+{
+ u8 *prog = *pprog;
+
+ if (!is_subprog) {
+ /* cmp rax, MAX_TAIL_CALL_CNT */
+ EMIT4(0x48, 0x83, 0xF8, MAX_TAIL_CALL_CNT);
+ EMIT2(X86_JA, 6); /* ja 6 */
+ /* rax is tail_call_cnt if <= MAX_TAIL_CALL_CNT.
+ * case1: entry of main prog.
+ * case2: tail callee of main prog.
+ */
+ EMIT1(0x50); /* push rax */
+ /* Make rax as tail_call_cnt_ptr. */
+ EMIT3(0x48, 0x89, 0xE0); /* mov rax, rsp */
+ EMIT2(0xEB, 1); /* jmp 1 */
+ /* rax is tail_call_cnt_ptr if > MAX_TAIL_CALL_CNT.
+ * case: tail callee of subprog.
+ */
+ EMIT1(0x50); /* push rax */
+ /* push tail_call_cnt_ptr */
+ EMIT1(0x50); /* push rax */
+ } else { /* is_subprog */
+ /* rax is tail_call_cnt_ptr. */
+ EMIT1(0x50); /* push rax */
+ EMIT1(0x50); /* push rax */
+ }
+
+ *pprog = prog;
+}
+
/*
* Emit x86-64 prologue code for BPF program.
* bpf_tail_call helper will skip the first X86_TAIL_CALL_OFFSET bytes
@@ -424,10 +505,10 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf,
/* When it's the entry of the whole tailcall context,
* zeroing rax means initialising tail_call_cnt.
*/
- EMIT2(0x31, 0xC0); /* xor eax, eax */
+ EMIT3(0x48, 0x31, 0xC0); /* xor rax, rax */
else
/* Keep the same instruction layout. */
- EMIT2(0x66, 0x90); /* nop2 */
+ emit_nops(&prog, 3); /* nop3 */
}
/* Exception callback receives FP as third parameter */
if (is_exception_cb) {
@@ -453,7 +534,7 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf,
if (stack_depth)
EMIT3_off32(0x48, 0x81, 0xEC, round_up(stack_depth, 8));
if (tail_call_reachable)
- EMIT1(0x50); /* push rax */
+ emit_prologue_tail_call(&prog, is_subprog);
*pprog = prog;
}
@@ -589,13 +670,15 @@ static void emit_return(u8 **pprog, u8 *ip)
*pprog = prog;
}
+#define BPF_TAIL_CALL_CNT_PTR_STACK_OFF(stack) (-16 - round_up(stack, 8))
+
/*
* Generate the following code:
*
* ... bpf_tail_call(void *ctx, struct bpf_array *array, u64 index) ...
* if (index >= array->map.max_entries)
* goto out;
- * if (tail_call_cnt++ >= MAX_TAIL_CALL_CNT)
+ * if ((*tcc_ptr)++ >= MAX_TAIL_CALL_CNT)
* goto out;
* prog = array->ptrs[index];
* if (prog == NULL)
@@ -608,7 +691,7 @@ static void emit_bpf_tail_call_indirect(struct bpf_prog *bpf_prog,
u32 stack_depth, u8 *ip,
struct jit_context *ctx)
{
- int tcc_off = -4 - round_up(stack_depth, 8);
+ int tcc_ptr_off = BPF_TAIL_CALL_CNT_PTR_STACK_OFF(stack_depth);
u8 *prog = *pprog, *start = *pprog;
int offset;
@@ -630,16 +713,14 @@ static void emit_bpf_tail_call_indirect(struct bpf_prog *bpf_prog,
EMIT2(X86_JBE, offset); /* jbe out */
/*
- * if (tail_call_cnt++ >= MAX_TAIL_CALL_CNT)
+ * if ((*tcc_ptr)++ >= MAX_TAIL_CALL_CNT)
* goto out;
*/
- EMIT2_off32(0x8B, 0x85, tcc_off); /* mov eax, dword ptr [rbp - tcc_off] */
- EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */
+ EMIT3_off32(0x48, 0x8B, 0x85, tcc_ptr_off); /* mov rax, qword ptr [rbp - tcc_ptr_off] */
+ EMIT4(0x48, 0x83, 0x38, MAX_TAIL_CALL_CNT); /* cmp qword ptr [rax], MAX_TAIL_CALL_CNT */
offset = ctx->tail_call_indirect_label - (prog + 2 - start);
EMIT2(X86_JAE, offset); /* jae out */
- EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */
- EMIT2_off32(0x89, 0x85, tcc_off); /* mov dword ptr [rbp - tcc_off], eax */
/* prog = array->ptrs[index]; */
EMIT4_off32(0x48, 0x8B, 0x8C, 0xD6, /* mov rcx, [rsi + rdx * 8 + offsetof(...)] */
@@ -654,6 +735,9 @@ static void emit_bpf_tail_call_indirect(struct bpf_prog *bpf_prog,
offset = ctx->tail_call_indirect_label - (prog + 2 - start);
EMIT2(X86_JE, offset); /* je out */
+ /* Inc tail_call_cnt if the slot is populated. */
+ EMIT4(0x48, 0x83, 0x00, 0x01); /* add qword ptr [rax], 1 */
+
if (bpf_prog->aux->exception_boundary) {
pop_callee_regs(&prog, all_callee_regs_used);
pop_r12(&prog);
@@ -663,6 +747,11 @@ static void emit_bpf_tail_call_indirect(struct bpf_prog *bpf_prog,
pop_r12(&prog);
}
+ /* Pop tail_call_cnt_ptr. */
+ EMIT1(0x58); /* pop rax */
+ /* Pop tail_call_cnt, if it's main prog.
+ * Pop tail_call_cnt_ptr, if it's subprog.
+ */
EMIT1(0x58); /* pop rax */
if (stack_depth)
EMIT3_off32(0x48, 0x81, 0xC4, /* add rsp, sd */
@@ -691,21 +780,19 @@ static void emit_bpf_tail_call_direct(struct bpf_prog *bpf_prog,
bool *callee_regs_used, u32 stack_depth,
struct jit_context *ctx)
{
- int tcc_off = -4 - round_up(stack_depth, 8);
+ int tcc_ptr_off = BPF_TAIL_CALL_CNT_PTR_STACK_OFF(stack_depth);
u8 *prog = *pprog, *start = *pprog;
int offset;
/*
- * if (tail_call_cnt++ >= MAX_TAIL_CALL_CNT)
+ * if ((*tcc_ptr)++ >= MAX_TAIL_CALL_CNT)
* goto out;
*/
- EMIT2_off32(0x8B, 0x85, tcc_off); /* mov eax, dword ptr [rbp - tcc_off] */
- EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */
+ EMIT3_off32(0x48, 0x8B, 0x85, tcc_ptr_off); /* mov rax, qword ptr [rbp - tcc_ptr_off] */
+ EMIT4(0x48, 0x83, 0x38, MAX_TAIL_CALL_CNT); /* cmp qword ptr [rax], MAX_TAIL_CALL_CNT */
offset = ctx->tail_call_direct_label - (prog + 2 - start);
EMIT2(X86_JAE, offset); /* jae out */
- EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */
- EMIT2_off32(0x89, 0x85, tcc_off); /* mov dword ptr [rbp - tcc_off], eax */
poke->tailcall_bypass = ip + (prog - start);
poke->adj_off = X86_TAIL_CALL_OFFSET;
@@ -715,6 +802,9 @@ static void emit_bpf_tail_call_direct(struct bpf_prog *bpf_prog,
emit_jump(&prog, (u8 *)poke->tailcall_target + X86_PATCH_SIZE,
poke->tailcall_bypass);
+ /* Inc tail_call_cnt if the slot is populated. */
+ EMIT4(0x48, 0x83, 0x00, 0x01); /* add qword ptr [rax], 1 */
+
if (bpf_prog->aux->exception_boundary) {
pop_callee_regs(&prog, all_callee_regs_used);
pop_r12(&prog);
@@ -724,6 +814,11 @@ static void emit_bpf_tail_call_direct(struct bpf_prog *bpf_prog,
pop_r12(&prog);
}
+ /* Pop tail_call_cnt_ptr. */
+ EMIT1(0x58); /* pop rax */
+ /* Pop tail_call_cnt, if it's main prog.
+ * Pop tail_call_cnt_ptr, if it's subprog.
+ */
EMIT1(0x58); /* pop rax */
if (stack_depth)
EMIT3_off32(0x48, 0x81, 0xC4, round_up(stack_depth, 8));
@@ -816,9 +911,10 @@ done:
static void emit_mov_imm64(u8 **pprog, u32 dst_reg,
const u32 imm32_hi, const u32 imm32_lo)
{
+ u64 imm64 = ((u64)imm32_hi << 32) | (u32)imm32_lo;
u8 *prog = *pprog;
- if (is_uimm32(((u64)imm32_hi << 32) | (u32)imm32_lo)) {
+ if (is_uimm32(imm64)) {
/*
* For emitting plain u32, where sign bit must not be
* propagated LLVM tends to load imm64 over mov32
@@ -826,6 +922,8 @@ static void emit_mov_imm64(u8 **pprog, u32 dst_reg,
* 'mov %eax, imm32' instead.
*/
emit_mov_imm32(&prog, false, dst_reg, imm32_lo);
+ } else if (is_simm32(imm64)) {
+ emit_mov_imm32(&prog, true, dst_reg, imm32_lo);
} else {
/* movabsq rax, imm64 */
EMIT2(add_1mod(0x48, dst_reg), add_1reg(0xB8, dst_reg));
@@ -1169,6 +1267,54 @@ static int emit_atomic(u8 **pprog, u8 atomic_op,
return 0;
}
+static int emit_atomic_index(u8 **pprog, u8 atomic_op, u32 size,
+ u32 dst_reg, u32 src_reg, u32 index_reg, int off)
+{
+ u8 *prog = *pprog;
+
+ EMIT1(0xF0); /* lock prefix */
+ switch (size) {
+ case BPF_W:
+ EMIT1(add_3mod(0x40, dst_reg, src_reg, index_reg));
+ break;
+ case BPF_DW:
+ EMIT1(add_3mod(0x48, dst_reg, src_reg, index_reg));
+ break;
+ default:
+ pr_err("bpf_jit: 1 and 2 byte atomics are not supported\n");
+ return -EFAULT;
+ }
+
+ /* emit opcode */
+ switch (atomic_op) {
+ case BPF_ADD:
+ case BPF_AND:
+ case BPF_OR:
+ case BPF_XOR:
+ /* lock *(u32/u64*)(dst_reg + idx_reg + off) <op>= src_reg */
+ EMIT1(simple_alu_opcodes[atomic_op]);
+ break;
+ case BPF_ADD | BPF_FETCH:
+ /* src_reg = atomic_fetch_add(dst_reg + idx_reg + off, src_reg); */
+ EMIT2(0x0F, 0xC1);
+ break;
+ case BPF_XCHG:
+ /* src_reg = atomic_xchg(dst_reg + idx_reg + off, src_reg); */
+ EMIT1(0x87);
+ break;
+ case BPF_CMPXCHG:
+ /* r0 = atomic_cmpxchg(dst_reg + idx_reg + off, r0, src_reg); */
+ EMIT2(0x0F, 0xB1);
+ break;
+ default:
+ pr_err("bpf_jit: unknown atomic opcode %02x\n", atomic_op);
+ return -EFAULT;
+ }
+ emit_insn_suffix_SIB(&prog, dst_reg, src_reg, index_reg, off);
+ *pprog = prog;
+ return 0;
+}
+
#define DONT_CLEAR 1
bool ex_handler_bpf(const struct exception_table_entry *x, struct pt_regs *regs)
@@ -1183,13 +1329,11 @@ bool ex_handler_bpf(const struct exception_table_entry *x, struct pt_regs *regs)
}
static void detect_reg_usage(struct bpf_insn *insn, int insn_cnt,
- bool *regs_used, bool *tail_call_seen)
+ bool *regs_used)
{
int i;
for (i = 1; i <= insn_cnt; i++, insn++) {
- if (insn->code == (BPF_JMP | BPF_TAIL_CALL))
- *tail_call_seen = true;
if (insn->dst_reg == BPF_REG_6 || insn->src_reg == BPF_REG_6)
regs_used[0] = true;
if (insn->dst_reg == BPF_REG_7 || insn->src_reg == BPF_REG_7)
@@ -1262,9 +1406,11 @@ static void emit_shiftx(u8 **pprog, u32 dst_reg, u8 src_reg, bool is64, u8 op)
#define INSN_SZ_DIFF (((addrs[i] - addrs[i - 1]) - (prog - temp)))
-/* mov rax, qword ptr [rbp - rounded_stack_depth - 8] */
-#define RESTORE_TAIL_CALL_CNT(stack) \
- EMIT3_off32(0x48, 0x8B, 0x85, -round_up(stack, 8) - 8)
+#define __LOAD_TCC_PTR(off) \
+ EMIT3_off32(0x48, 0x8B, 0x85, off)
+/* mov rax, qword ptr [rbp - rounded_stack_depth - 16] */
+#define LOAD_TAIL_CALL_CNT_PTR(stack) \
+ __LOAD_TCC_PTR(BPF_TAIL_CALL_CNT_PTR_STACK_OFF(stack))
static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image,
int oldproglen, struct jit_context *ctx, bool jmp_padding)
@@ -1273,7 +1419,6 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
struct bpf_insn *insn = bpf_prog->insnsi;
bool callee_regs_used[4] = {};
int insn_cnt = bpf_prog->len;
- bool tail_call_seen = false;
bool seen_exit = false;
u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY];
u64 arena_vm_start, user_vm_start;
@@ -1285,11 +1430,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
arena_vm_start = bpf_arena_get_kern_vm_start(bpf_prog->aux->arena);
user_vm_start = bpf_arena_get_user_vm_start(bpf_prog->aux->arena);
- detect_reg_usage(insn, insn_cnt, callee_regs_used,
- &tail_call_seen);
-
- /* tail call's presence in current prog implies it is reachable */
- tail_call_reachable |= tail_call_seen;
+ detect_reg_usage(insn, insn_cnt, callee_regs_used);
emit_prologue(&prog, bpf_prog->aux->stack_depth,
bpf_prog_was_classic(bpf_prog), tail_call_reachable,
@@ -1351,8 +1492,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
break;
case BPF_ALU64 | BPF_MOV | BPF_X:
- if (insn->off == BPF_ADDR_SPACE_CAST &&
- insn->imm == 1U << 16) {
+ if (insn_is_cast_user(insn)) {
if (dst_reg != src_reg)
/* 32-bit mov */
emit_mov_reg(&prog, false, dst_reg, src_reg);
@@ -1383,6 +1523,16 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
maybe_emit_mod(&prog, AUX_REG, dst_reg, true);
EMIT3(0x0F, 0x44, add_2reg(0xC0, AUX_REG, dst_reg));
break;
+ } else if (insn_is_mov_percpu_addr(insn)) {
+ /* mov <dst>, <src> (if necessary) */
+ EMIT_mov(dst_reg, src_reg);
+#ifdef CONFIG_SMP
+ /* add <dst>, gs:[<off>] */
+ EMIT2(0x65, add_1mod(0x48, dst_reg));
+ EMIT3(0x03, add_2reg(0x04, 0, dst_reg), 0x25);
+ EMIT((u32)(unsigned long)&this_cpu_off, 4);
+#endif
+ break;
}
fallthrough;
case BPF_ALU | BPF_MOV | BPF_X:
@@ -1963,13 +2113,22 @@ populate_extable:
return err;
break;
+ case BPF_STX | BPF_PROBE_ATOMIC | BPF_W:
+ case BPF_STX | BPF_PROBE_ATOMIC | BPF_DW:
+ start_of_ldx = prog;
+ err = emit_atomic_index(&prog, insn->imm, BPF_SIZE(insn->code),
+ dst_reg, src_reg, X86_REG_R12, insn->off);
+ if (err)
+ return err;
+ goto populate_extable;
+
/* call */
case BPF_JMP | BPF_CALL: {
u8 *ip = image + addrs[i - 1];
func = (u8 *) __bpf_call_base + imm32;
if (tail_call_reachable) {
- RESTORE_TAIL_CALL_CNT(bpf_prog->aux->stack_depth);
+ LOAD_TAIL_CALL_CNT_PTR(bpf_prog->aux->stack_depth);
ip += 7;
}
if (!imm32)
@@ -2122,7 +2281,7 @@ emit_cond_jmp: /* Convert BPF opcode to x86 */
return -EFAULT;
}
jmp_offset = addrs[i + insn->off] - addrs[i];
- if (is_imm8(jmp_offset)) {
+ if (is_imm8_jmp_offset(jmp_offset)) {
if (jmp_padding) {
/* To keep the jmp_offset valid, the extra bytes are
* padded before the jump insn, so we subtract the
@@ -2204,7 +2363,7 @@ emit_cond_jmp: /* Convert BPF opcode to x86 */
break;
}
emit_jmp:
- if (is_imm8(jmp_offset)) {
+ if (is_imm8_jmp_offset(jmp_offset)) {
if (jmp_padding) {
/* To avoid breaking jmp_offset, the extra bytes
* are padded before the actual jmp insn, so
@@ -2644,6 +2803,10 @@ static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog,
return 0;
}
+/* mov rax, qword ptr [rbp - rounded_stack_depth - 8] */
+#define LOAD_TRAMP_TAIL_CALL_CNT_PTR(stack) \
+ __LOAD_TCC_PTR(-round_up(stack, 8) - 8)
+
/* Example:
* __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev);
* its 'struct btf_func_model' will be nr_args=2
@@ -2764,7 +2927,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
* [ ... ]
* [ stack_arg2 ]
* RBP - arg_stack_off [ stack_arg1 ]
- * RSP [ tail_call_cnt ] BPF_TRAMP_F_TAIL_CALL_CTX
+ * RSP [ tail_call_cnt_ptr ] BPF_TRAMP_F_TAIL_CALL_CTX
*/
/* room for return value of orig_call or fentry prog */
@@ -2893,10 +3056,10 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
save_args(m, &prog, arg_stack_off, true);
if (flags & BPF_TRAMP_F_TAIL_CALL_CTX) {
- /* Before calling the original function, restore the
- * tail_call_cnt from stack to rax.
+ /* Before calling the original function, load the
+ * tail_call_cnt_ptr from stack to rax.
*/
- RESTORE_TAIL_CALL_CNT(stack_size);
+ LOAD_TRAMP_TAIL_CALL_CNT_PTR(stack_size);
}
if (flags & BPF_TRAMP_F_ORIG_STACK) {
@@ -2955,10 +3118,10 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
goto cleanup;
}
} else if (flags & BPF_TRAMP_F_TAIL_CALL_CTX) {
- /* Before running the original function, restore the
- * tail_call_cnt from stack to rax.
+ /* Before running the original function, load the
+ * tail_call_cnt_ptr from stack to rax.
*/
- RESTORE_TAIL_CALL_CNT(stack_size);
+ LOAD_TRAMP_TAIL_CALL_CNT_PTR(stack_size);
}
/* restore return value of orig_call or fentry prog back into RAX */
@@ -2994,12 +3157,9 @@ void arch_free_bpf_trampoline(void *image, unsigned int size)
bpf_prog_pack_free(image, size);
}
-void arch_protect_bpf_trampoline(void *image, unsigned int size)
-{
-}
-
-void arch_unprotect_bpf_trampoline(void *image, unsigned int size)
+int arch_protect_bpf_trampoline(void *image, unsigned int size)
{
+ return 0;
}
int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end,
@@ -3297,7 +3457,7 @@ out_image:
*
* Both cases are serious bugs and justify WARN_ON.
*/
- if (WARN_ON(bpf_jit_binary_pack_finalize(prog, header, rw_header))) {
+ if (WARN_ON(bpf_jit_binary_pack_finalize(header, rw_header))) {
/* header has been freed */
header = NULL;
goto out_image;
@@ -3359,6 +3519,11 @@ bool bpf_jit_supports_subprog_tailcalls(void)
return true;
}
+bool bpf_jit_supports_percpu_insn(void)
+{
+ return true;
+}
+
void bpf_jit_free(struct bpf_prog *prog)
{
if (prog->jited) {
@@ -3371,7 +3536,7 @@ void bpf_jit_free(struct bpf_prog *prog)
* before freeing it.
*/
if (jit_data) {
- bpf_jit_binary_pack_finalize(prog, jit_data->header,
+ bpf_jit_binary_pack_finalize(jit_data->header,
jit_data->rw_header);
kvfree(jit_data->addrs);
kfree(jit_data);
@@ -3462,6 +3627,21 @@ bool bpf_jit_supports_arena(void)
return true;
}
+bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena)
+{
+ if (!in_arena)
+ return true;
+ switch (insn->code) {
+ case BPF_STX | BPF_ATOMIC | BPF_W:
+ case BPF_STX | BPF_ATOMIC | BPF_DW:
+ if (insn->imm == (BPF_AND | BPF_FETCH) ||
+ insn->imm == (BPF_OR | BPF_FETCH) ||
+ insn->imm == (BPF_XOR | BPF_FETCH))
+ return false;
+ }
+ return true;
+}
+
bool bpf_jit_supports_ptr_xchg(void)
{
return true;
diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c
index c10083a8e68e..de0f9e5f9f73 100644
--- a/arch/x86/net/bpf_jit_comp32.c
+++ b/arch/x86/net/bpf_jit_comp32.c
@@ -2600,8 +2600,7 @@ out_image:
if (bpf_jit_enable > 1)
bpf_jit_dump(prog->len, proglen, pass + 1, image);
- if (image) {
- bpf_jit_binary_lock_ro(header);
+ if (image && !bpf_jit_binary_lock_ro(header)) {
prog->bpf_func = (void *)image;
prog->jited = 1;
prog->jited_len = proglen;
diff --git a/arch/x86/pci/ce4100.c b/arch/x86/pci/ce4100.c
index 87313701f069..f5dbd25651e0 100644
--- a/arch/x86/pci/ce4100.c
+++ b/arch/x86/pci/ce4100.c
@@ -35,12 +35,6 @@ struct sim_dev_reg {
struct sim_reg sim_reg;
};
-struct sim_reg_op {
- void (*init)(struct sim_dev_reg *reg);
- void (*read)(struct sim_dev_reg *reg, u32 value);
- void (*write)(struct sim_dev_reg *reg, u32 value);
-};
-
#define MB (1024 * 1024)
#define KB (1024)
#define SIZE_TO_MASK(size) (~(size - 1))
diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index b33afb240601..98a9bb92d75c 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -980,7 +980,7 @@ static void amd_rp_pme_suspend(struct pci_dev *dev)
return;
rp = pcie_find_root_port(dev);
- if (!rp->pm_cap)
+ if (!rp || !rp->pm_cap)
return;
rp->pme_support &= ~((PCI_PM_CAP_PME_D3hot|PCI_PM_CAP_PME_D3cold) >>
@@ -994,7 +994,7 @@ static void amd_rp_pme_resume(struct pci_dev *dev)
u16 pmc;
rp = pcie_find_root_port(dev);
- if (!rp->pm_cap)
+ if (!rp || !rp->pm_cap)
return;
pci_read_config_word(rp, rp->pm_cap + PCI_PM_PMC, &pmc);
diff --git a/arch/x86/pci/intel_mid_pci.c b/arch/x86/pci/intel_mid_pci.c
index 8edd62206604..b433b1753016 100644
--- a/arch/x86/pci/intel_mid_pci.c
+++ b/arch/x86/pci/intel_mid_pci.c
@@ -216,7 +216,7 @@ static int pci_write(struct pci_bus *bus, unsigned int devfn, int where,
}
static const struct x86_cpu_id intel_mid_cpu_ids[] = {
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_SILVERMONT_MID, NULL),
+ X86_MATCH_VFM(INTEL_ATOM_SILVERMONT_MID, NULL),
{}
};
@@ -233,9 +233,9 @@ static int intel_mid_pci_irq_enable(struct pci_dev *dev)
return 0;
ret = pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &gsi);
- if (ret < 0) {
+ if (ret) {
dev_warn(&dev->dev, "Failed to read interrupt line: %d\n", ret);
- return ret;
+ return pcibios_err_to_errno(ret);
}
id = x86_match_cpu(intel_mid_cpu_ids);
@@ -243,7 +243,7 @@ static int intel_mid_pci_irq_enable(struct pci_dev *dev)
model = id->model;
switch (model) {
- case INTEL_FAM6_ATOM_SILVERMONT_MID:
+ case VFM_MODEL(INTEL_ATOM_SILVERMONT_MID):
polarity_low = false;
/* Special treatment for IRQ0 */
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index 0cc9520666ef..39255f0eb14d 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -518,7 +518,34 @@ static bool __ref pci_mmcfg_reserved(struct device *dev,
{
struct resource *conflict;
- if (!early && !acpi_disabled) {
+ if (early) {
+
+ /*
+ * Don't try to do this check unless configuration type 1
+ * is available. How about type 2?
+ */
+
+ /*
+ * 946f2ee5c731 ("Check that MCFG points to an e820
+ * reserved area") added this E820 check in 2006 to work
+ * around BIOS defects.
+ *
+ * Per PCI Firmware r3.3, sec 4.1.2, ECAM space must be
+ * reserved by a PNP0C02 resource, but it need not be
+ * mentioned in E820. Before the ACPI interpreter is
+ * available, we can't check for PNP0C02 resources, so
+ * there's no reliable way to verify the region in this
+ * early check. Keep it only for the old machines that
+ * motivated 946f2ee5c731.
+ */
+ if (dmi_get_bios_year() < 2016 && raw_pci_ops)
+ return is_mmconf_reserved(e820__mapped_all, cfg, dev,
+ "E820 entry");
+
+ return true;
+ }
+
+ if (!acpi_disabled) {
if (is_mmconf_reserved(is_acpi_reserved, cfg, dev,
"ACPI motherboard resource"))
return true;
@@ -551,16 +578,7 @@ static bool __ref pci_mmcfg_reserved(struct device *dev,
* For MCFG information constructed from hotpluggable host bridge's
* _CBA method, just assume it's reserved.
*/
- if (pci_mmcfg_running_state)
- return true;
-
- /* Don't try to do this check unless configuration
- type 1 is available. how about type 2 ?*/
- if (raw_pci_ops)
- return is_mmconf_reserved(e820__mapped_all, cfg, dev,
- "E820 entry");
-
- return false;
+ return pci_mmcfg_running_state;
}
static void __init pci_mmcfg_reject_broken(int early)
diff --git a/arch/x86/pci/olpc.c b/arch/x86/pci/olpc.c
index f3aab76e357a..4b18c6404363 100644
--- a/arch/x86/pci/olpc.c
+++ b/arch/x86/pci/olpc.c
@@ -154,9 +154,6 @@ static const uint32_t ehci_hdr[] = { /* dev f function 4 - devfn = 7d */
0x0, 0x40, 0x0, 0x40a, /* CapPtr INT-D, IRQA */
0xc8020001, 0x0, 0x0, 0x0, /* Capabilities - 40 is R/O, 44 is
mask 8103 (power control) */
-#if 0
- 0x1, 0x40080000, 0x0, 0x0, /* EECP - see EHCI spec section 2.1.7 */
-#endif
0x01000001, 0x0, 0x0, 0x0, /* EECP - see EHCI spec section 2.1.7 */
0x2020, 0x0, 0x0, 0x0, /* (EHCI page 8) 60 SBRN (R/O),
61 FLADJ (R/W), PORTWAKECAP */
diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index 652cd53e77f6..0f2fe524f60d 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -38,10 +38,10 @@ static int xen_pcifront_enable_irq(struct pci_dev *dev)
u8 gsi;
rc = pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &gsi);
- if (rc < 0) {
+ if (rc) {
dev_warn(&dev->dev, "Xen PCI: failed to read interrupt line: %d\n",
rc);
- return rc;
+ return pcibios_err_to_errno(rc);
}
/* In PV DomU the Xen PCI backend puts the PIRQ in the interrupt line.*/
pirq = gsi;
diff --git a/arch/x86/platform/atom/punit_atom_debug.c b/arch/x86/platform/atom/punit_atom_debug.c
index 6b9c6deca8ba..44c30ce6360a 100644
--- a/arch/x86/platform/atom/punit_atom_debug.c
+++ b/arch/x86/platform/atom/punit_atom_debug.c
@@ -165,14 +165,13 @@ static void punit_s2idle_check_register(struct punit_device *punit_device) {}
static void punit_s2idle_check_unregister(void) {}
#endif
-#define X86_MATCH(model, data) \
- X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, INTEL_FAM6_##model, \
- X86_FEATURE_MWAIT, data)
+#define X86_MATCH(vfm, data) \
+ X86_MATCH_VFM_FEATURE(vfm, X86_FEATURE_MWAIT, data)
static const struct x86_cpu_id intel_punit_cpu_ids[] = {
- X86_MATCH(ATOM_SILVERMONT, &punit_device_byt),
- X86_MATCH(ATOM_SILVERMONT_MID, &punit_device_tng),
- X86_MATCH(ATOM_AIRMONT, &punit_device_cht),
+ X86_MATCH(INTEL_ATOM_SILVERMONT, &punit_device_byt),
+ X86_MATCH(INTEL_ATOM_SILVERMONT_MID, &punit_device_tng),
+ X86_MATCH(INTEL_ATOM_AIRMONT, &punit_device_cht),
{}
};
MODULE_DEVICE_TABLE(x86cpu, intel_punit_cpu_ids);
diff --git a/arch/x86/platform/ce4100/ce4100.c b/arch/x86/platform/ce4100/ce4100.c
index f32451bdcfdd..f8126821a94d 100644
--- a/arch/x86/platform/ce4100/ce4100.c
+++ b/arch/x86/platform/ce4100/ce4100.c
@@ -139,7 +139,6 @@ void __init x86_ce4100_early_setup(void)
x86_init.resources.probe_roms = x86_init_noop;
x86_init.mpparse.find_mptable = x86_init_noop;
x86_init.mpparse.early_parse_smp_cfg = x86_init_noop;
- x86_init.mpparse.parse_smp_cfg = x86_dtb_parse_smp_config;
x86_init.pci.init = ce4100_pci_init;
x86_init.pci.init_irq = sdv_pci_init;
diff --git a/arch/x86/platform/efi/Makefile b/arch/x86/platform/efi/Makefile
index 543df9a1379d..500cab4a7f7c 100644
--- a/arch/x86/platform/efi/Makefile
+++ b/arch/x86/platform/efi/Makefile
@@ -5,5 +5,4 @@ GCOV_PROFILE := n
obj-$(CONFIG_EFI) += memmap.o quirks.o efi.o efi_$(BITS).o \
efi_stub_$(BITS).o
obj-$(CONFIG_EFI_MIXED) += efi_thunk_$(BITS).o
-obj-$(CONFIG_EFI_FAKE_MEMMAP) += fake_mem.o
obj-$(CONFIG_EFI_RUNTIME_MAP) += runtime-map.o
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index f090ec972d7b..88a96816de9a 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -226,8 +226,6 @@ int __init efi_memblock_x86_reserve_range(void)
if (add_efi_memmap || do_efi_soft_reserve())
do_add_efi_memmap();
- efi_fake_memmap_early();
-
WARN(efi.memmap.desc_version != 1,
"Unexpected EFI_MEMORY_DESCRIPTOR version %ld",
efi.memmap.desc_version);
diff --git a/arch/x86/platform/efi/fake_mem.c b/arch/x86/platform/efi/fake_mem.c
deleted file mode 100644
index 41d57cad3d84..000000000000
--- a/arch/x86/platform/efi/fake_mem.c
+++ /dev/null
@@ -1,197 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * fake_mem.c
- *
- * Copyright (C) 2015 FUJITSU LIMITED
- * Author: Taku Izumi <izumi.taku@jp.fujitsu.com>
- *
- * This code introduces new boot option named "efi_fake_mem"
- * By specifying this parameter, you can add arbitrary attribute to
- * specific memory range by updating original (firmware provided) EFI
- * memmap.
- */
-
-#include <linux/kernel.h>
-#include <linux/efi.h>
-#include <linux/init.h>
-#include <linux/memblock.h>
-#include <linux/types.h>
-#include <linux/sort.h>
-#include <asm/e820/api.h>
-#include <asm/efi.h>
-
-#define EFI_MAX_FAKEMEM CONFIG_EFI_MAX_FAKE_MEM
-
-static struct efi_mem_range efi_fake_mems[EFI_MAX_FAKEMEM];
-static int nr_fake_mem;
-
-static int __init cmp_fake_mem(const void *x1, const void *x2)
-{
- const struct efi_mem_range *m1 = x1;
- const struct efi_mem_range *m2 = x2;
-
- if (m1->range.start < m2->range.start)
- return -1;
- if (m1->range.start > m2->range.start)
- return 1;
- return 0;
-}
-
-static void __init efi_fake_range(struct efi_mem_range *efi_range)
-{
- struct efi_memory_map_data data = { 0 };
- int new_nr_map = efi.memmap.nr_map;
- efi_memory_desc_t *md;
- void *new_memmap;
-
- /* count up the number of EFI memory descriptor */
- for_each_efi_memory_desc(md)
- new_nr_map += efi_memmap_split_count(md, &efi_range->range);
-
- /* allocate memory for new EFI memmap */
- if (efi_memmap_alloc(new_nr_map, &data) != 0)
- return;
-
- /* create new EFI memmap */
- new_memmap = early_memremap(data.phys_map, data.size);
- if (!new_memmap) {
- __efi_memmap_free(data.phys_map, data.size, data.flags);
- return;
- }
-
- efi_memmap_insert(&efi.memmap, new_memmap, efi_range);
-
- /* swap into new EFI memmap */
- early_memunmap(new_memmap, data.size);
-
- efi_memmap_install(&data);
-}
-
-void __init efi_fake_memmap(void)
-{
- int i;
-
- if (!efi_enabled(EFI_MEMMAP) || !nr_fake_mem)
- return;
-
- for (i = 0; i < nr_fake_mem; i++)
- efi_fake_range(&efi_fake_mems[i]);
-
- /* print new EFI memmap */
- efi_print_memmap();
-}
-
-static int __init setup_fake_mem(char *p)
-{
- u64 start = 0, mem_size = 0, attribute = 0;
- int i;
-
- if (!p)
- return -EINVAL;
-
- while (*p != '\0') {
- mem_size = memparse(p, &p);
- if (*p == '@')
- start = memparse(p+1, &p);
- else
- break;
-
- if (*p == ':')
- attribute = simple_strtoull(p+1, &p, 0);
- else
- break;
-
- if (nr_fake_mem >= EFI_MAX_FAKEMEM)
- break;
-
- efi_fake_mems[nr_fake_mem].range.start = start;
- efi_fake_mems[nr_fake_mem].range.end = start + mem_size - 1;
- efi_fake_mems[nr_fake_mem].attribute = attribute;
- nr_fake_mem++;
-
- if (*p == ',')
- p++;
- }
-
- sort(efi_fake_mems, nr_fake_mem, sizeof(struct efi_mem_range),
- cmp_fake_mem, NULL);
-
- for (i = 0; i < nr_fake_mem; i++)
- pr_info("efi_fake_mem: add attr=0x%016llx to [mem 0x%016llx-0x%016llx]",
- efi_fake_mems[i].attribute, efi_fake_mems[i].range.start,
- efi_fake_mems[i].range.end);
-
- return *p == '\0' ? 0 : -EINVAL;
-}
-
-early_param("efi_fake_mem", setup_fake_mem);
-
-void __init efi_fake_memmap_early(void)
-{
- int i;
-
- /*
- * The late efi_fake_mem() call can handle all requests if
- * EFI_MEMORY_SP support is disabled.
- */
- if (!efi_soft_reserve_enabled())
- return;
-
- if (!efi_enabled(EFI_MEMMAP) || !nr_fake_mem)
- return;
-
- /*
- * Given that efi_fake_memmap() needs to perform memblock
- * allocations it needs to run after e820__memblock_setup().
- * However, if efi_fake_mem specifies EFI_MEMORY_SP for a given
- * address range that potentially needs to mark the memory as
- * reserved prior to e820__memblock_setup(). Update e820
- * directly if EFI_MEMORY_SP is specified for an
- * EFI_CONVENTIONAL_MEMORY descriptor.
- */
- for (i = 0; i < nr_fake_mem; i++) {
- struct efi_mem_range *mem = &efi_fake_mems[i];
- efi_memory_desc_t *md;
- u64 m_start, m_end;
-
- if ((mem->attribute & EFI_MEMORY_SP) == 0)
- continue;
-
- m_start = mem->range.start;
- m_end = mem->range.end;
- for_each_efi_memory_desc(md) {
- u64 start, end, size;
-
- if (md->type != EFI_CONVENTIONAL_MEMORY)
- continue;
-
- start = md->phys_addr;
- end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT) - 1;
-
- if (m_start <= end && m_end >= start)
- /* fake range overlaps descriptor */;
- else
- continue;
-
- /*
- * Trim the boundary of the e820 update to the
- * descriptor in case the fake range overlaps
- * !EFI_CONVENTIONAL_MEMORY
- */
- start = max(start, m_start);
- end = min(end, m_end);
- size = end - start + 1;
-
- if (end <= start)
- continue;
-
- /*
- * Ensure each efi_fake_mem instance results in
- * a unique e820 resource
- */
- e820__range_remove(start, size, E820_TYPE_RAM, 1);
- e820__range_add(start, size, E820_TYPE_SOFT_RESERVED);
- e820__update_table(e820_table);
- }
- }
-}
diff --git a/arch/x86/platform/efi/memmap.c b/arch/x86/platform/efi/memmap.c
index 4ef20b49eb5e..061b8ecc71a1 100644
--- a/arch/x86/platform/efi/memmap.c
+++ b/arch/x86/platform/efi/memmap.c
@@ -30,6 +30,7 @@ static phys_addr_t __init __efi_memmap_alloc_late(unsigned long size)
return PFN_PHYS(page_to_pfn(p));
}
+static
void __init __efi_memmap_free(u64 phys, unsigned long size, unsigned long flags)
{
if (flags & EFI_MEMMAP_MEMBLOCK) {
@@ -92,12 +93,22 @@ int __init efi_memmap_alloc(unsigned int num_entries,
*/
int __init efi_memmap_install(struct efi_memory_map_data *data)
{
+ unsigned long size = efi.memmap.desc_size * efi.memmap.nr_map;
+ unsigned long flags = efi.memmap.flags;
+ u64 phys = efi.memmap.phys_map;
+ int ret;
+
efi_memmap_unmap();
if (efi_enabled(EFI_PARAVIRT))
return 0;
- return __efi_memmap_init(data);
+ ret = __efi_memmap_init(data);
+ if (ret)
+ return ret;
+
+ __efi_memmap_free(phys, size, flags);
+ return 0;
}
/**
diff --git a/arch/x86/platform/geode/Makefile b/arch/x86/platform/geode/Makefile
index a8a6b1dedb01..34b53e97a0ad 100644
--- a/arch/x86/platform/geode/Makefile
+++ b/arch/x86/platform/geode/Makefile
@@ -1,4 +1,5 @@
# SPDX-License-Identifier: GPL-2.0-only
+obj-$(CONFIG_GEODE_COMMON) += geode-common.o
obj-$(CONFIG_ALIX) += alix.o
obj-$(CONFIG_NET5501) += net5501.o
obj-$(CONFIG_GEOS) += geos.o
diff --git a/arch/x86/platform/geode/alix.c b/arch/x86/platform/geode/alix.c
index b39bf3b5e108..be65cd704e21 100644
--- a/arch/x86/platform/geode/alix.c
+++ b/arch/x86/platform/geode/alix.c
@@ -18,15 +18,12 @@
#include <linux/io.h>
#include <linux/string.h>
#include <linux/moduleparam.h>
-#include <linux/leds.h>
-#include <linux/platform_device.h>
-#include <linux/input.h>
-#include <linux/gpio_keys.h>
-#include <linux/gpio/machine.h>
#include <linux/dmi.h>
#include <asm/geode.h>
+#include "geode-common.h"
+
#define BIOS_SIGNATURE_TINYBIOS 0xf0000
#define BIOS_SIGNATURE_COREBOOT 0x500
#define BIOS_REGION_SIZE 0x10000
@@ -41,79 +38,16 @@ module_param(force, bool, 0444);
/* FIXME: Award bios is not automatically detected as Alix platform */
MODULE_PARM_DESC(force, "Force detection as ALIX.2/ALIX.3 platform");
-static struct gpio_keys_button alix_gpio_buttons[] = {
- {
- .code = KEY_RESTART,
- .gpio = 24,
- .active_low = 1,
- .desc = "Reset button",
- .type = EV_KEY,
- .wakeup = 0,
- .debounce_interval = 100,
- .can_disable = 0,
- }
-};
-static struct gpio_keys_platform_data alix_buttons_data = {
- .buttons = alix_gpio_buttons,
- .nbuttons = ARRAY_SIZE(alix_gpio_buttons),
- .poll_interval = 20,
-};
-
-static struct platform_device alix_buttons_dev = {
- .name = "gpio-keys-polled",
- .id = 1,
- .dev = {
- .platform_data = &alix_buttons_data,
- }
-};
-
-static struct gpio_led alix_leds[] = {
- {
- .name = "alix:1",
- .default_trigger = "default-on",
- },
- {
- .name = "alix:2",
- .default_trigger = "default-off",
- },
- {
- .name = "alix:3",
- .default_trigger = "default-off",
- },
-};
-
-static struct gpio_led_platform_data alix_leds_data = {
- .num_leds = ARRAY_SIZE(alix_leds),
- .leds = alix_leds,
-};
-
-static struct gpiod_lookup_table alix_leds_gpio_table = {
- .dev_id = "leds-gpio",
- .table = {
- /* The Geode GPIOs should be on the CS5535 companion chip */
- GPIO_LOOKUP_IDX("cs5535-gpio", 6, NULL, 0, GPIO_ACTIVE_LOW),
- GPIO_LOOKUP_IDX("cs5535-gpio", 25, NULL, 1, GPIO_ACTIVE_LOW),
- GPIO_LOOKUP_IDX("cs5535-gpio", 27, NULL, 2, GPIO_ACTIVE_LOW),
- { }
- },
-};
-
-static struct platform_device alix_leds_dev = {
- .name = "leds-gpio",
- .id = -1,
- .dev.platform_data = &alix_leds_data,
-};
-
-static struct platform_device *alix_devs[] __initdata = {
- &alix_buttons_dev,
- &alix_leds_dev,
+static const struct geode_led alix_leds[] __initconst = {
+ { 6, true },
+ { 25, false },
+ { 27, false },
};
static void __init register_alix(void)
{
- /* Setup LED control through leds-gpio driver */
- gpiod_add_lookup_table(&alix_leds_gpio_table);
- platform_add_devices(alix_devs, ARRAY_SIZE(alix_devs));
+ geode_create_restart_key(24);
+ geode_create_leds("alix", alix_leds, ARRAY_SIZE(alix_leds));
}
static bool __init alix_present(unsigned long bios_phys,
diff --git a/arch/x86/platform/geode/geode-common.c b/arch/x86/platform/geode/geode-common.c
new file mode 100644
index 000000000000..8fd78e60bf15
--- /dev/null
+++ b/arch/x86/platform/geode/geode-common.c
@@ -0,0 +1,178 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Shared helpers to register GPIO-connected buttons and LEDs
+ * on AMD Geode boards.
+ */
+
+#include <linux/err.h>
+#include <linux/gpio/machine.h>
+#include <linux/gpio/property.h>
+#include <linux/input.h>
+#include <linux/leds.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+
+#include "geode-common.h"
+
+static const struct software_node geode_gpiochip_node = {
+ .name = "cs5535-gpio",
+};
+
+static const struct property_entry geode_gpio_keys_props[] = {
+ PROPERTY_ENTRY_U32("poll-interval", 20),
+ { }
+};
+
+static const struct software_node geode_gpio_keys_node = {
+ .name = "geode-gpio-keys",
+ .properties = geode_gpio_keys_props,
+};
+
+static struct property_entry geode_restart_key_props[] = {
+ { /* Placeholder for GPIO property */ },
+ PROPERTY_ENTRY_U32("linux,code", KEY_RESTART),
+ PROPERTY_ENTRY_STRING("label", "Reset button"),
+ PROPERTY_ENTRY_U32("debounce-interval", 100),
+ { }
+};
+
+static const struct software_node geode_restart_key_node = {
+ .parent = &geode_gpio_keys_node,
+ .properties = geode_restart_key_props,
+};
+
+static const struct software_node *geode_gpio_keys_swnodes[] __initconst = {
+ &geode_gpiochip_node,
+ &geode_gpio_keys_node,
+ &geode_restart_key_node,
+ NULL
+};
+
+/*
+ * Creates gpio-keys-polled device for the restart key.
+ *
+ * Note that it needs to be called first, before geode_create_leds(),
+ * because it registers gpiochip software node used by both gpio-keys and
+ * leds-gpio devices.
+ */
+int __init geode_create_restart_key(unsigned int pin)
+{
+ struct platform_device_info keys_info = {
+ .name = "gpio-keys-polled",
+ .id = 1,
+ };
+ struct platform_device *pd;
+ int err;
+
+ geode_restart_key_props[0] = PROPERTY_ENTRY_GPIO("gpios",
+ &geode_gpiochip_node,
+ pin, GPIO_ACTIVE_LOW);
+
+ err = software_node_register_node_group(geode_gpio_keys_swnodes);
+ if (err) {
+ pr_err("failed to register gpio-keys software nodes: %d\n", err);
+ return err;
+ }
+
+ keys_info.fwnode = software_node_fwnode(&geode_gpio_keys_node);
+
+ pd = platform_device_register_full(&keys_info);
+ err = PTR_ERR_OR_ZERO(pd);
+ if (err) {
+ pr_err("failed to create gpio-keys device: %d\n", err);
+ software_node_unregister_node_group(geode_gpio_keys_swnodes);
+ return err;
+ }
+
+ return 0;
+}
+
+static const struct software_node geode_gpio_leds_node = {
+ .name = "geode-leds",
+};
+
+#define MAX_LEDS 3
+
+int __init geode_create_leds(const char *label, const struct geode_led *leds,
+ unsigned int n_leds)
+{
+ const struct software_node *group[MAX_LEDS + 2] = { 0 };
+ struct software_node *swnodes;
+ struct property_entry *props;
+ struct platform_device_info led_info = {
+ .name = "leds-gpio",
+ .id = PLATFORM_DEVID_NONE,
+ };
+ struct platform_device *led_dev;
+ const char *node_name;
+ int err;
+ int i;
+
+ if (n_leds > MAX_LEDS) {
+ pr_err("%s: too many LEDs\n", __func__);
+ return -EINVAL;
+ }
+
+ swnodes = kcalloc(n_leds, sizeof(*swnodes), GFP_KERNEL);
+ if (!swnodes)
+ return -ENOMEM;
+
+ /*
+ * Each LED is represented by 3 properties: "gpios",
+ * "linux,default-trigger", and am empty terminator.
+ */
+ props = kcalloc(n_leds * 3, sizeof(*props), GFP_KERNEL);
+ if (!props) {
+ err = -ENOMEM;
+ goto err_free_swnodes;
+ }
+
+ group[0] = &geode_gpio_leds_node;
+ for (i = 0; i < n_leds; i++) {
+ node_name = kasprintf(GFP_KERNEL, "%s:%d", label, i);
+ if (!node_name) {
+ err = -ENOMEM;
+ goto err_free_names;
+ }
+
+ props[i * 3 + 0] =
+ PROPERTY_ENTRY_GPIO("gpios", &geode_gpiochip_node,
+ leds[i].pin, GPIO_ACTIVE_LOW);
+ props[i * 3 + 1] =
+ PROPERTY_ENTRY_STRING("linux,default-trigger",
+ leds[i].default_on ?
+ "default-on" : "default-off");
+ /* props[i * 3 + 2] is an empty terminator */
+
+ swnodes[i] = SOFTWARE_NODE(node_name, &props[i * 3],
+ &geode_gpio_leds_node);
+ group[i + 1] = &swnodes[i];
+ }
+
+ err = software_node_register_node_group(group);
+ if (err) {
+ pr_err("failed to register LED software nodes: %d\n", err);
+ goto err_free_names;
+ }
+
+ led_info.fwnode = software_node_fwnode(&geode_gpio_leds_node);
+
+ led_dev = platform_device_register_full(&led_info);
+ err = PTR_ERR_OR_ZERO(led_dev);
+ if (err) {
+ pr_err("failed to create LED device: %d\n", err);
+ goto err_unregister_group;
+ }
+
+ return 0;
+
+err_unregister_group:
+ software_node_unregister_node_group(group);
+err_free_names:
+ while (--i >= 0)
+ kfree(swnodes[i].name);
+ kfree(props);
+err_free_swnodes:
+ kfree(swnodes);
+ return err;
+}
diff --git a/arch/x86/platform/geode/geode-common.h b/arch/x86/platform/geode/geode-common.h
new file mode 100644
index 000000000000..9e0afd34bfad
--- /dev/null
+++ b/arch/x86/platform/geode/geode-common.h
@@ -0,0 +1,21 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Shared helpers to register GPIO-connected buttons and LEDs
+ * on AMD Geode boards.
+ */
+
+#ifndef __PLATFORM_GEODE_COMMON_H
+#define __PLATFORM_GEODE_COMMON_H
+
+#include <linux/property.h>
+
+struct geode_led {
+ unsigned int pin;
+ bool default_on;
+};
+
+int geode_create_restart_key(unsigned int pin);
+int geode_create_leds(const char *label, const struct geode_led *leds,
+ unsigned int n_leds);
+
+#endif /* __PLATFORM_GEODE_COMMON_H */
diff --git a/arch/x86/platform/geode/geos.c b/arch/x86/platform/geode/geos.c
index d263528c90bb..98027fb1ec32 100644
--- a/arch/x86/platform/geode/geos.c
+++ b/arch/x86/platform/geode/geos.c
@@ -16,88 +16,22 @@
#include <linux/init.h>
#include <linux/io.h>
#include <linux/string.h>
-#include <linux/leds.h>
-#include <linux/platform_device.h>
-#include <linux/input.h>
-#include <linux/gpio_keys.h>
-#include <linux/gpio/machine.h>
#include <linux/dmi.h>
#include <asm/geode.h>
-static struct gpio_keys_button geos_gpio_buttons[] = {
- {
- .code = KEY_RESTART,
- .gpio = 3,
- .active_low = 1,
- .desc = "Reset button",
- .type = EV_KEY,
- .wakeup = 0,
- .debounce_interval = 100,
- .can_disable = 0,
- }
-};
-static struct gpio_keys_platform_data geos_buttons_data = {
- .buttons = geos_gpio_buttons,
- .nbuttons = ARRAY_SIZE(geos_gpio_buttons),
- .poll_interval = 20,
-};
-
-static struct platform_device geos_buttons_dev = {
- .name = "gpio-keys-polled",
- .id = 1,
- .dev = {
- .platform_data = &geos_buttons_data,
- }
-};
-
-static struct gpio_led geos_leds[] = {
- {
- .name = "geos:1",
- .default_trigger = "default-on",
- },
- {
- .name = "geos:2",
- .default_trigger = "default-off",
- },
- {
- .name = "geos:3",
- .default_trigger = "default-off",
- },
-};
-
-static struct gpio_led_platform_data geos_leds_data = {
- .num_leds = ARRAY_SIZE(geos_leds),
- .leds = geos_leds,
-};
-
-static struct gpiod_lookup_table geos_leds_gpio_table = {
- .dev_id = "leds-gpio",
- .table = {
- /* The Geode GPIOs should be on the CS5535 companion chip */
- GPIO_LOOKUP_IDX("cs5535-gpio", 6, NULL, 0, GPIO_ACTIVE_LOW),
- GPIO_LOOKUP_IDX("cs5535-gpio", 25, NULL, 1, GPIO_ACTIVE_LOW),
- GPIO_LOOKUP_IDX("cs5535-gpio", 27, NULL, 2, GPIO_ACTIVE_LOW),
- { }
- },
-};
-
-static struct platform_device geos_leds_dev = {
- .name = "leds-gpio",
- .id = -1,
- .dev.platform_data = &geos_leds_data,
-};
+#include "geode-common.h"
-static struct platform_device *geos_devs[] __initdata = {
- &geos_buttons_dev,
- &geos_leds_dev,
+static const struct geode_led geos_leds[] __initconst = {
+ { 6, true },
+ { 25, false },
+ { 27, false },
};
static void __init register_geos(void)
{
- /* Setup LED control through leds-gpio driver */
- gpiod_add_lookup_table(&geos_leds_gpio_table);
- platform_add_devices(geos_devs, ARRAY_SIZE(geos_devs));
+ geode_create_restart_key(3);
+ geode_create_leds("geos", geos_leds, ARRAY_SIZE(geos_leds));
}
static int __init geos_init(void)
diff --git a/arch/x86/platform/geode/net5501.c b/arch/x86/platform/geode/net5501.c
index 558384acd777..c9cee7dea99b 100644
--- a/arch/x86/platform/geode/net5501.c
+++ b/arch/x86/platform/geode/net5501.c
@@ -16,80 +16,25 @@
#include <linux/init.h>
#include <linux/io.h>
#include <linux/string.h>
-#include <linux/leds.h>
-#include <linux/platform_device.h>
#include <linux/input.h>
-#include <linux/gpio_keys.h>
#include <linux/gpio/machine.h>
+#include <linux/gpio/property.h>
#include <asm/geode.h>
+#include "geode-common.h"
+
#define BIOS_REGION_BASE 0xffff0000
#define BIOS_REGION_SIZE 0x00010000
-static struct gpio_keys_button net5501_gpio_buttons[] = {
- {
- .code = KEY_RESTART,
- .gpio = 24,
- .active_low = 1,
- .desc = "Reset button",
- .type = EV_KEY,
- .wakeup = 0,
- .debounce_interval = 100,
- .can_disable = 0,
- }
-};
-static struct gpio_keys_platform_data net5501_buttons_data = {
- .buttons = net5501_gpio_buttons,
- .nbuttons = ARRAY_SIZE(net5501_gpio_buttons),
- .poll_interval = 20,
-};
-
-static struct platform_device net5501_buttons_dev = {
- .name = "gpio-keys-polled",
- .id = 1,
- .dev = {
- .platform_data = &net5501_buttons_data,
- }
-};
-
-static struct gpio_led net5501_leds[] = {
- {
- .name = "net5501:1",
- .default_trigger = "default-on",
- },
-};
-
-static struct gpio_led_platform_data net5501_leds_data = {
- .num_leds = ARRAY_SIZE(net5501_leds),
- .leds = net5501_leds,
-};
-
-static struct gpiod_lookup_table net5501_leds_gpio_table = {
- .dev_id = "leds-gpio",
- .table = {
- /* The Geode GPIOs should be on the CS5535 companion chip */
- GPIO_LOOKUP_IDX("cs5535-gpio", 6, NULL, 0, GPIO_ACTIVE_HIGH),
- { }
- },
-};
-
-static struct platform_device net5501_leds_dev = {
- .name = "leds-gpio",
- .id = -1,
- .dev.platform_data = &net5501_leds_data,
-};
-
-static struct platform_device *net5501_devs[] __initdata = {
- &net5501_buttons_dev,
- &net5501_leds_dev,
+static const struct geode_led net5501_leds[] __initconst = {
+ { 6, true },
};
static void __init register_net5501(void)
{
- /* Setup LED control through leds-gpio driver */
- gpiod_add_lookup_table(&net5501_leds_gpio_table);
- platform_add_devices(net5501_devs, ARRAY_SIZE(net5501_devs));
+ geode_create_restart_key(24);
+ geode_create_leds("net5501", net5501_leds, ARRAY_SIZE(net5501_leds));
}
struct net5501_board {
diff --git a/arch/x86/platform/intel-mid/intel-mid.c b/arch/x86/platform/intel-mid/intel-mid.c
index 7be71c2cdc83..a8e75f8c14fd 100644
--- a/arch/x86/platform/intel-mid/intel-mid.c
+++ b/arch/x86/platform/intel-mid/intel-mid.c
@@ -22,13 +22,15 @@
#include <asm/mpspec_def.h>
#include <asm/hw_irq.h>
#include <asm/apic.h>
+#include <asm/cpu_device_id.h>
#include <asm/io_apic.h>
#include <asm/intel-mid.h>
#include <asm/io.h>
#include <asm/i8259.h>
-#include <asm/intel_scu_ipc.h>
#include <asm/reboot.h>
+#include <linux/platform_data/x86/intel_scu_ipc.h>
+
#define IPCMSG_COLD_OFF 0x80 /* Only for Tangier */
#define IPCMSG_COLD_RESET 0xF1
@@ -55,9 +57,8 @@ static void __init intel_mid_time_init(void)
static void intel_mid_arch_setup(void)
{
- switch (boot_cpu_data.x86_model) {
- case 0x3C:
- case 0x4A:
+ switch (boot_cpu_data.x86_vfm) {
+ case INTEL_ATOM_SILVERMONT_MID:
x86_platform.legacy.rtc = 1;
break;
default:
diff --git a/arch/x86/platform/intel/iosf_mbi.c b/arch/x86/platform/intel/iosf_mbi.c
index fdd49d70b437..c81cea208c2c 100644
--- a/arch/x86/platform/intel/iosf_mbi.c
+++ b/arch/x86/platform/intel/iosf_mbi.c
@@ -62,7 +62,7 @@ static int iosf_mbi_pci_read_mdr(u32 mcrx, u32 mcr, u32 *mdr)
fail_read:
dev_err(&mbi_pdev->dev, "PCI config access failed with %d\n", result);
- return result;
+ return pcibios_err_to_errno(result);
}
static int iosf_mbi_pci_write_mdr(u32 mcrx, u32 mcr, u32 mdr)
@@ -91,7 +91,7 @@ static int iosf_mbi_pci_write_mdr(u32 mcrx, u32 mcr, u32 mdr)
fail_write:
dev_err(&mbi_pdev->dev, "PCI config access failed with %d\n", result);
- return result;
+ return pcibios_err_to_errno(result);
}
int iosf_mbi_read(u8 port, u8 opcode, u32 offset, u32 *mdr)
diff --git a/arch/x86/platform/iris/iris.c b/arch/x86/platform/iris/iris.c
index b42bfdab01a9..c5f3bbdbdcfe 100644
--- a/arch/x86/platform/iris/iris.c
+++ b/arch/x86/platform/iris/iris.c
@@ -62,11 +62,10 @@ static int iris_probe(struct platform_device *pdev)
return 0;
}
-static int iris_remove(struct platform_device *pdev)
+static void iris_remove(struct platform_device *pdev)
{
pm_power_off = old_pm_power_off;
printk(KERN_INFO "Iris power_off handler uninstalled.\n");
- return 0;
}
static struct platform_driver iris_driver = {
@@ -74,7 +73,7 @@ static struct platform_driver iris_driver = {
.name = "iris",
},
.probe = iris_probe,
- .remove = iris_remove,
+ .remove_new = iris_remove,
};
static struct resource iris_resources[] = {
diff --git a/arch/x86/platform/olpc/olpc-xo1-pm.c b/arch/x86/platform/olpc/olpc-xo1-pm.c
index f067ac780ba7..6a9c42de74e7 100644
--- a/arch/x86/platform/olpc/olpc-xo1-pm.c
+++ b/arch/x86/platform/olpc/olpc-xo1-pm.c
@@ -144,7 +144,7 @@ static int xo1_pm_probe(struct platform_device *pdev)
return 0;
}
-static int xo1_pm_remove(struct platform_device *pdev)
+static void xo1_pm_remove(struct platform_device *pdev)
{
if (strcmp(pdev->name, "cs5535-pms") == 0)
pms_base = 0;
@@ -152,7 +152,6 @@ static int xo1_pm_remove(struct platform_device *pdev)
acpi_base = 0;
pm_power_off = NULL;
- return 0;
}
static struct platform_driver cs5535_pms_driver = {
@@ -160,7 +159,7 @@ static struct platform_driver cs5535_pms_driver = {
.name = "cs5535-pms",
},
.probe = xo1_pm_probe,
- .remove = xo1_pm_remove,
+ .remove_new = xo1_pm_remove,
};
static struct platform_driver cs5535_acpi_driver = {
@@ -168,7 +167,7 @@ static struct platform_driver cs5535_acpi_driver = {
.name = "olpc-xo1-pm-acpi",
},
.probe = xo1_pm_probe,
- .remove = xo1_pm_remove,
+ .remove_new = xo1_pm_remove,
};
static int __init xo1_pm_init(void)
diff --git a/arch/x86/platform/olpc/olpc-xo1-sci.c b/arch/x86/platform/olpc/olpc-xo1-sci.c
index 89f25af4b3c3..46d42ff6e18a 100644
--- a/arch/x86/platform/olpc/olpc-xo1-sci.c
+++ b/arch/x86/platform/olpc/olpc-xo1-sci.c
@@ -598,7 +598,7 @@ err_ebook:
return r;
}
-static int xo1_sci_remove(struct platform_device *pdev)
+static void xo1_sci_remove(struct platform_device *pdev)
{
free_irq(sci_irq, pdev);
cancel_work_sync(&sci_work);
@@ -608,7 +608,6 @@ static int xo1_sci_remove(struct platform_device *pdev)
free_ebook_switch();
free_power_button();
acpi_base = 0;
- return 0;
}
static struct platform_driver xo1_sci_driver = {
@@ -617,7 +616,7 @@ static struct platform_driver xo1_sci_driver = {
.dev_groups = lid_groups,
},
.probe = xo1_sci_probe,
- .remove = xo1_sci_remove,
+ .remove_new = xo1_sci_remove,
.suspend = xo1_sci_suspend,
.resume = xo1_sci_resume,
};
diff --git a/arch/x86/platform/pvh/Makefile b/arch/x86/platform/pvh/Makefile
index 5dec5067c9fb..c43fb7964dc4 100644
--- a/arch/x86/platform/pvh/Makefile
+++ b/arch/x86/platform/pvh/Makefile
@@ -1,5 +1,6 @@
# SPDX-License-Identifier: GPL-2.0
OBJECT_FILES_NON_STANDARD_head.o := y
+KASAN_SANITIZE := n
obj-$(CONFIG_PVH) += enlighten.o
obj-$(CONFIG_PVH) += head.o
diff --git a/arch/x86/platform/pvh/enlighten.c b/arch/x86/platform/pvh/enlighten.c
index 8c2d4b8de25d..2263885d16ba 100644
--- a/arch/x86/platform/pvh/enlighten.c
+++ b/arch/x86/platform/pvh/enlighten.c
@@ -75,9 +75,6 @@ static void __init init_pvh_bootparams(bool xen_guest)
} else
xen_raw_printk("Warning: Can fit ISA range into e820\n");
- if (xen_guest)
- xen_reserve_extra_memory(&pvh_bootparams);
-
pvh_bootparams.hdr.cmd_line_ptr =
pvh_start_info.cmdline_paddr;
@@ -133,7 +130,11 @@ void __init xen_prepare_pvh(void)
BUG();
}
- memset(&pvh_bootparams, 0, sizeof(pvh_bootparams));
+ /*
+ * This must not compile to "call memset" because memset() may be
+ * instrumented.
+ */
+ __builtin_memset(&pvh_bootparams, 0, sizeof(pvh_bootparams));
hypervisor_specific_init(xen_guest);
diff --git a/arch/x86/platform/pvh/head.S b/arch/x86/platform/pvh/head.S
index f7235ef87bc3..64fca49cd88f 100644
--- a/arch/x86/platform/pvh/head.S
+++ b/arch/x86/platform/pvh/head.S
@@ -7,6 +7,7 @@
.code32
.text
#define _pa(x) ((x) - __START_KERNEL_map)
+#define rva(x) ((x) - pvh_start_xen)
#include <linux/elfnote.h>
#include <linux/init.h>
@@ -15,6 +16,7 @@
#include <asm/segment.h>
#include <asm/asm.h>
#include <asm/boot.h>
+#include <asm/pgtable.h>
#include <asm/processor-flags.h>
#include <asm/msr.h>
#include <asm/nospec-branch.h>
@@ -54,7 +56,25 @@ SYM_CODE_START_LOCAL(pvh_start_xen)
UNWIND_HINT_END_OF_STACK
cld
- lgdt (_pa(gdt))
+ /*
+ * See the comment for startup_32 for more details. We need to
+ * execute a call to get the execution address to be position
+ * independent, but we don't have a stack. Save and restore the
+ * magic field of start_info in ebx, and use that as the stack.
+ */
+ mov (%ebx), %eax
+ leal 4(%ebx), %esp
+ ANNOTATE_INTRA_FUNCTION_CALL
+ call 1f
+1: popl %ebp
+ mov %eax, (%ebx)
+ subl $rva(1b), %ebp
+ movl $0, %esp
+
+ leal rva(gdt)(%ebp), %eax
+ leal rva(gdt_start)(%ebp), %ecx
+ movl %ecx, 2(%eax)
+ lgdt (%eax)
mov $PVH_DS_SEL,%eax
mov %eax,%ds
@@ -62,14 +82,14 @@ SYM_CODE_START_LOCAL(pvh_start_xen)
mov %eax,%ss
/* Stash hvm_start_info. */
- mov $_pa(pvh_start_info), %edi
+ leal rva(pvh_start_info)(%ebp), %edi
mov %ebx, %esi
- mov _pa(pvh_start_info_sz), %ecx
+ movl rva(pvh_start_info_sz)(%ebp), %ecx
shr $2,%ecx
rep
movsl
- mov $_pa(early_stack_end), %esp
+ leal rva(early_stack_end)(%ebp), %esp
/* Enable PAE mode. */
mov %cr4, %eax
@@ -83,31 +103,86 @@ SYM_CODE_START_LOCAL(pvh_start_xen)
btsl $_EFER_LME, %eax
wrmsr
+ mov %ebp, %ebx
+ subl $_pa(pvh_start_xen), %ebx /* offset */
+ jz .Lpagetable_done
+
+ /* Fixup page-tables for relocation. */
+ leal rva(pvh_init_top_pgt)(%ebp), %edi
+ movl $PTRS_PER_PGD, %ecx
+2:
+ testl $_PAGE_PRESENT, 0x00(%edi)
+ jz 1f
+ addl %ebx, 0x00(%edi)
+1:
+ addl $8, %edi
+ decl %ecx
+ jnz 2b
+
+ /* L3 ident has a single entry. */
+ leal rva(pvh_level3_ident_pgt)(%ebp), %edi
+ addl %ebx, 0x00(%edi)
+
+ leal rva(pvh_level3_kernel_pgt)(%ebp), %edi
+ addl %ebx, (PAGE_SIZE - 16)(%edi)
+ addl %ebx, (PAGE_SIZE - 8)(%edi)
+
+ /* pvh_level2_ident_pgt is fine - large pages */
+
+ /* pvh_level2_kernel_pgt needs adjustment - large pages */
+ leal rva(pvh_level2_kernel_pgt)(%ebp), %edi
+ movl $PTRS_PER_PMD, %ecx
+2:
+ testl $_PAGE_PRESENT, 0x00(%edi)
+ jz 1f
+ addl %ebx, 0x00(%edi)
+1:
+ addl $8, %edi
+ decl %ecx
+ jnz 2b
+
+.Lpagetable_done:
/* Enable pre-constructed page tables. */
- mov $_pa(init_top_pgt), %eax
+ leal rva(pvh_init_top_pgt)(%ebp), %eax
mov %eax, %cr3
mov $(X86_CR0_PG | X86_CR0_PE), %eax
mov %eax, %cr0
/* Jump to 64-bit mode. */
- ljmp $PVH_CS_SEL, $_pa(1f)
+ pushl $PVH_CS_SEL
+ leal rva(1f)(%ebp), %eax
+ pushl %eax
+ lretl
/* 64-bit entry point. */
.code64
1:
+ UNWIND_HINT_END_OF_STACK
+
/* Set base address in stack canary descriptor. */
mov $MSR_GS_BASE,%ecx
- mov $_pa(canary), %eax
+ leal canary(%rip), %eax
xor %edx, %edx
wrmsr
+ /*
+ * Calculate load offset and store in phys_base. __pa() needs
+ * phys_base set to calculate the hypercall page in xen_pvh_init().
+ */
+ movq %rbp, %rbx
+ subq $_pa(pvh_start_xen), %rbx
+ movq %rbx, phys_base(%rip)
call xen_prepare_pvh
+ /*
+ * Clear phys_base. __startup_64 will *add* to its value,
+ * so reset to 0.
+ */
+ xor %rbx, %rbx
+ movq %rbx, phys_base(%rip)
/* startup_64 expects boot_params in %rsi. */
- mov $_pa(pvh_bootparams), %rsi
- mov $_pa(startup_64), %rax
- ANNOTATE_RETPOLINE_SAFE
- jmp *%rax
+ lea pvh_bootparams(%rip), %rsi
+ jmp startup_64
#else /* CONFIG_X86_64 */
@@ -143,7 +218,7 @@ SYM_CODE_END(pvh_start_xen)
.balign 8
SYM_DATA_START_LOCAL(gdt)
.word gdt_end - gdt_start
- .long _pa(gdt_start)
+ .long _pa(gdt_start) /* x86-64 will overwrite if relocated. */
.word 0
SYM_DATA_END(gdt)
SYM_DATA_START_LOCAL(gdt_start)
@@ -163,5 +238,67 @@ SYM_DATA_START_LOCAL(early_stack)
.fill BOOT_STACK_SIZE, 1, 0
SYM_DATA_END_LABEL(early_stack, SYM_L_LOCAL, early_stack_end)
+#ifdef CONFIG_X86_64
+/*
+ * Xen PVH needs a set of identity mapped and kernel high mapping
+ * page tables. pvh_start_xen starts running on the identity mapped
+ * page tables, but xen_prepare_pvh calls into the high mapping.
+ * These page tables need to be relocatable and are only used until
+ * startup_64 transitions to init_top_pgt.
+ */
+SYM_DATA_START_PAGE_ALIGNED(pvh_init_top_pgt)
+ .quad pvh_level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
+ .org pvh_init_top_pgt + L4_PAGE_OFFSET * 8, 0
+ .quad pvh_level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
+ .org pvh_init_top_pgt + L4_START_KERNEL * 8, 0
+ /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
+ .quad pvh_level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
+SYM_DATA_END(pvh_init_top_pgt)
+
+SYM_DATA_START_PAGE_ALIGNED(pvh_level3_ident_pgt)
+ .quad pvh_level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
+ .fill 511, 8, 0
+SYM_DATA_END(pvh_level3_ident_pgt)
+SYM_DATA_START_PAGE_ALIGNED(pvh_level2_ident_pgt)
+ /*
+ * Since I easily can, map the first 1G.
+ * Don't set NX because code runs from these pages.
+ *
+ * Note: This sets _PAGE_GLOBAL despite whether
+ * the CPU supports it or it is enabled. But,
+ * the CPU should ignore the bit.
+ */
+ PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
+SYM_DATA_END(pvh_level2_ident_pgt)
+SYM_DATA_START_PAGE_ALIGNED(pvh_level3_kernel_pgt)
+ .fill L3_START_KERNEL, 8, 0
+ /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */
+ .quad pvh_level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
+ .quad 0 /* no fixmap */
+SYM_DATA_END(pvh_level3_kernel_pgt)
+
+SYM_DATA_START_PAGE_ALIGNED(pvh_level2_kernel_pgt)
+ /*
+ * Kernel high mapping.
+ *
+ * The kernel code+data+bss must be located below KERNEL_IMAGE_SIZE in
+ * virtual address space, which is 1 GiB if RANDOMIZE_BASE is enabled,
+ * 512 MiB otherwise.
+ *
+ * (NOTE: after that starts the module area, see MODULES_VADDR.)
+ *
+ * This table is eventually used by the kernel during normal runtime.
+ * Care must be taken to clear out undesired bits later, like _PAGE_RW
+ * or _PAGE_GLOBAL in some cases.
+ */
+ PMDS(0, __PAGE_KERNEL_LARGE_EXEC, KERNEL_IMAGE_SIZE / PMD_SIZE)
+SYM_DATA_END(pvh_level2_kernel_pgt)
+
+ ELFNOTE(Xen, XEN_ELFNOTE_PHYS32_RELOC,
+ .long CONFIG_PHYSICAL_ALIGN;
+ .long LOAD_PHYSICAL_ADDR;
+ .long KERNEL_IMAGE_SIZE - 1)
+#endif
+
ELFNOTE(Xen, XEN_ELFNOTE_PHYS32_ENTRY,
_ASM_PTR (pvh_start_xen - __START_KERNEL_map))
diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile
index bc31863c5ee6..ebdfd7b84feb 100644
--- a/arch/x86/purgatory/Makefile
+++ b/arch/x86/purgatory/Makefile
@@ -1,5 +1,4 @@
# SPDX-License-Identifier: GPL-2.0
-OBJECT_FILES_NON_STANDARD := y
purgatory-y := purgatory.o stack.o setup-x86_$(BITS).o sha256.o entry64.o string.o
@@ -30,19 +29,12 @@ LDFLAGS_purgatory.ro := -r $(PURGATORY_LDFLAGS)
LDFLAGS_purgatory.chk := $(PURGATORY_LDFLAGS)
targets += purgatory.ro purgatory.chk
-# Sanitizer, etc. runtimes are unavailable and cannot be linked here.
-GCOV_PROFILE := n
-KASAN_SANITIZE := n
-UBSAN_SANITIZE := n
-KCSAN_SANITIZE := n
-KMSAN_SANITIZE := n
-KCOV_INSTRUMENT := n
-
# These are adjustments to the compiler flags used for objects that
# make up the standalone purgatory.ro
PURGATORY_CFLAGS_REMOVE := -mcmodel=kernel
-PURGATORY_CFLAGS := -mcmodel=large -ffreestanding -fno-zero-initialized-in-bss -g0
+PURGATORY_CFLAGS := -mcmodel=small -ffreestanding -fno-zero-initialized-in-bss -g0
+PURGATORY_CFLAGS += -fpic -fvisibility=hidden
PURGATORY_CFLAGS += $(DISABLE_STACKLEAK_PLUGIN) -DDISABLE_BRANCH_PROFILING
PURGATORY_CFLAGS += -fno-stack-protector
diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile
index f614009d3e4e..a0fb39abc5c8 100644
--- a/arch/x86/realmode/rm/Makefile
+++ b/arch/x86/realmode/rm/Makefile
@@ -7,15 +7,6 @@
#
#
-# Sanitizer runtimes are unavailable and cannot be linked here.
-KASAN_SANITIZE := n
-KCSAN_SANITIZE := n
-KMSAN_SANITIZE := n
-OBJECT_FILES_NON_STANDARD := y
-
-# Prevents link failures: __sanitizer_cov_trace_pc() is not linked in.
-KCOV_INSTRUMENT := n
-
always-y := realmode.bin realmode.relocs
wakeup-objs := wakeup_asm.o wakemain.o video-mode.o
@@ -76,5 +67,3 @@ KBUILD_CFLAGS := $(REALMODE_CFLAGS) -D_SETUP -D_WAKEUP \
-I$(srctree)/arch/x86/boot
KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__
KBUILD_CFLAGS += -fno-asynchronous-unwind-tables
-GCOV_PROFILE := n
-UBSAN_SANITIZE := n
diff --git a/arch/x86/tools/gen-insn-attr-x86.awk b/arch/x86/tools/gen-insn-attr-x86.awk
index af38469afd14..5770c8097f32 100644
--- a/arch/x86/tools/gen-insn-attr-x86.awk
+++ b/arch/x86/tools/gen-insn-attr-x86.awk
@@ -64,7 +64,9 @@ BEGIN {
modrm_expr = "^([CDEGMNPQRSUVW/][a-z]+|NTA|T[012])"
force64_expr = "\\([df]64\\)"
- rex_expr = "^REX(\\.[XRWB]+)*"
+ rex_expr = "^((REX(\\.[XRWB]+)+)|(REX$))"
+ rex2_expr = "\\(REX2\\)"
+ no_rex2_expr = "\\(!REX2\\)"
fpu_expr = "^ESC" # TODO
lprefix1_expr = "\\((66|!F3)\\)"
@@ -81,6 +83,8 @@ BEGIN {
vexonly_expr = "\\(v\\)"
# All opcodes with (ev) superscript supports *only* EVEX prefix
evexonly_expr = "\\(ev\\)"
+ # (es) is the same as (ev) but also "SCALABLE" i.e. W and pp determine operand size
+ evex_scalable_expr = "\\(es\\)"
prefix_expr = "\\(Prefix\\)"
prefix_num["Operand-Size"] = "INAT_PFX_OPNDSZ"
@@ -99,6 +103,7 @@ BEGIN {
prefix_num["VEX+1byte"] = "INAT_PFX_VEX2"
prefix_num["VEX+2byte"] = "INAT_PFX_VEX3"
prefix_num["EVEX"] = "INAT_PFX_EVEX"
+ prefix_num["REX2"] = "INAT_PFX_REX2"
clear_vars()
}
@@ -314,6 +319,10 @@ function convert_operands(count,opnd, i,j,imm,mod)
if (match(ext, force64_expr))
flags = add_flags(flags, "INAT_FORCE64")
+ # check REX2 not allowed
+ if (match(ext, no_rex2_expr))
+ flags = add_flags(flags, "INAT_NO_REX2")
+
# check REX prefix
if (match(opcode, rex_expr))
flags = add_flags(flags, "INAT_MAKE_PREFIX(INAT_PFX_REX)")
@@ -325,6 +334,8 @@ function convert_operands(count,opnd, i,j,imm,mod)
# check VEX codes
if (match(ext, evexonly_expr))
flags = add_flags(flags, "INAT_VEXOK | INAT_EVEXONLY")
+ else if (match(ext, evex_scalable_expr))
+ flags = add_flags(flags, "INAT_VEXOK | INAT_EVEXONLY | INAT_EVEX_SCALABLE")
else if (match(ext, vexonly_expr))
flags = add_flags(flags, "INAT_VEXOK | INAT_VEXONLY")
else if (match(ext, vexok_expr) || match(opcode, vexok_opcode_expr))
@@ -351,6 +362,8 @@ function convert_operands(count,opnd, i,j,imm,mod)
lptable3[idx] = add_flags(lptable3[idx],flags)
variant = "INAT_VARIANT"
}
+ if (match(ext, rex2_expr))
+ table[idx] = add_flags(table[idx], "INAT_REX2_VARIANT")
if (!match(ext, lprefix_expr)){
table[idx] = add_flags(table[idx],flags)
}
diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c
index b029fb81ebee..c101bed61940 100644
--- a/arch/x86/tools/relocs.c
+++ b/arch/x86/tools/relocs.c
@@ -11,41 +11,42 @@
#define Elf_Shdr ElfW(Shdr)
#define Elf_Sym ElfW(Sym)
-static Elf_Ehdr ehdr;
-static unsigned long shnum;
-static unsigned int shstrndx;
-static unsigned int shsymtabndx;
-static unsigned int shxsymtabndx;
+static Elf_Ehdr ehdr;
+static unsigned long shnum;
+static unsigned int shstrndx;
+static unsigned int shsymtabndx;
+static unsigned int shxsymtabndx;
static int sym_index(Elf_Sym *sym);
struct relocs {
- uint32_t *offset;
- unsigned long count;
- unsigned long size;
+ uint32_t *offset;
+ unsigned long count;
+ unsigned long size;
};
-static struct relocs relocs16;
-static struct relocs relocs32;
+static struct relocs relocs16;
+static struct relocs relocs32;
+
#if ELF_BITS == 64
-static struct relocs relocs32neg;
-static struct relocs relocs64;
-#define FMT PRIu64
+static struct relocs relocs32neg;
+static struct relocs relocs64;
+# define FMT PRIu64
#else
-#define FMT PRIu32
+# define FMT PRIu32
#endif
struct section {
- Elf_Shdr shdr;
- struct section *link;
- Elf_Sym *symtab;
- Elf32_Word *xsymtab;
- Elf_Rel *reltab;
- char *strtab;
+ Elf_Shdr shdr;
+ struct section *link;
+ Elf_Sym *symtab;
+ Elf32_Word *xsymtab;
+ Elf_Rel *reltab;
+ char *strtab;
};
-static struct section *secs;
+static struct section *secs;
-static const char * const sym_regex_kernel[S_NSYMTYPES] = {
+static const char * const sym_regex_kernel[S_NSYMTYPES] = {
/*
* Following symbols have been audited. There values are constant and do
* not change if bzImage is loaded at a different physical address than
@@ -115,13 +116,13 @@ static const char * const sym_regex_realmode[S_NSYMTYPES] = {
"^pa_",
};
-static const char * const *sym_regex;
+static const char * const *sym_regex;
+
+static regex_t sym_regex_c[S_NSYMTYPES];
-static regex_t sym_regex_c[S_NSYMTYPES];
static int is_reloc(enum symtype type, const char *sym_name)
{
- return sym_regex[type] &&
- !regexec(&sym_regex_c[type], sym_name, 0, NULL, 0);
+ return sym_regex[type] && !regexec(&sym_regex_c[type], sym_name, 0, NULL, 0);
}
static void regex_init(int use_real_mode)
@@ -139,8 +140,7 @@ static void regex_init(int use_real_mode)
if (!sym_regex[i])
continue;
- err = regcomp(&sym_regex_c[i], sym_regex[i],
- REG_EXTENDED|REG_NOSUB);
+ err = regcomp(&sym_regex_c[i], sym_regex[i], REG_EXTENDED|REG_NOSUB);
if (err) {
regerror(err, &sym_regex_c[i], errbuf, sizeof(errbuf));
@@ -163,9 +163,10 @@ static const char *sym_type(unsigned type)
#undef SYM_TYPE
};
const char *name = "unknown sym type name";
- if (type < ARRAY_SIZE(type_name)) {
+
+ if (type < ARRAY_SIZE(type_name))
name = type_name[type];
- }
+
return name;
}
@@ -179,9 +180,10 @@ static const char *sym_bind(unsigned bind)
#undef SYM_BIND
};
const char *name = "unknown sym bind name";
- if (bind < ARRAY_SIZE(bind_name)) {
+
+ if (bind < ARRAY_SIZE(bind_name))
name = bind_name[bind];
- }
+
return name;
}
@@ -196,9 +198,10 @@ static const char *sym_visibility(unsigned visibility)
#undef SYM_VISIBILITY
};
const char *name = "unknown sym visibility name";
- if (visibility < ARRAY_SIZE(visibility_name)) {
+
+ if (visibility < ARRAY_SIZE(visibility_name))
name = visibility_name[visibility];
- }
+
return name;
}
@@ -244,9 +247,10 @@ static const char *rel_type(unsigned type)
#undef REL_TYPE
};
const char *name = "unknown type rel type name";
- if (type < ARRAY_SIZE(type_name) && type_name[type]) {
+
+ if (type < ARRAY_SIZE(type_name) && type_name[type])
name = type_name[type];
- }
+
return name;
}
@@ -256,15 +260,14 @@ static const char *sec_name(unsigned shndx)
const char *name;
sec_strtab = secs[shstrndx].strtab;
name = "<noname>";
- if (shndx < shnum) {
+
+ if (shndx < shnum)
name = sec_strtab + secs[shndx].shdr.sh_name;
- }
- else if (shndx == SHN_ABS) {
+ else if (shndx == SHN_ABS)
name = "ABSOLUTE";
- }
- else if (shndx == SHN_COMMON) {
+ else if (shndx == SHN_COMMON)
name = "COMMON";
- }
+
return name;
}
@@ -272,18 +275,19 @@ static const char *sym_name(const char *sym_strtab, Elf_Sym *sym)
{
const char *name;
name = "<noname>";
- if (sym->st_name) {
+
+ if (sym->st_name)
name = sym_strtab + sym->st_name;
- }
- else {
+ else
name = sec_name(sym_index(sym));
- }
+
return name;
}
static Elf_Sym *sym_lookup(const char *symname)
{
int i;
+
for (i = 0; i < shnum; i++) {
struct section *sec = &secs[i];
long nsyms;
@@ -309,14 +313,15 @@ static Elf_Sym *sym_lookup(const char *symname)
}
#if BYTE_ORDER == LITTLE_ENDIAN
-#define le16_to_cpu(val) (val)
-#define le32_to_cpu(val) (val)
-#define le64_to_cpu(val) (val)
+# define le16_to_cpu(val) (val)
+# define le32_to_cpu(val) (val)
+# define le64_to_cpu(val) (val)
#endif
+
#if BYTE_ORDER == BIG_ENDIAN
-#define le16_to_cpu(val) bswap_16(val)
-#define le32_to_cpu(val) bswap_32(val)
-#define le64_to_cpu(val) bswap_64(val)
+# define le16_to_cpu(val) bswap_16(val)
+# define le32_to_cpu(val) bswap_32(val)
+# define le64_to_cpu(val) bswap_64(val)
#endif
static uint16_t elf16_to_cpu(uint16_t val)
@@ -337,13 +342,13 @@ static uint64_t elf64_to_cpu(uint64_t val)
{
return le64_to_cpu(val);
}
-#define elf_addr_to_cpu(x) elf64_to_cpu(x)
-#define elf_off_to_cpu(x) elf64_to_cpu(x)
-#define elf_xword_to_cpu(x) elf64_to_cpu(x)
+# define elf_addr_to_cpu(x) elf64_to_cpu(x)
+# define elf_off_to_cpu(x) elf64_to_cpu(x)
+# define elf_xword_to_cpu(x) elf64_to_cpu(x)
#else
-#define elf_addr_to_cpu(x) elf32_to_cpu(x)
-#define elf_off_to_cpu(x) elf32_to_cpu(x)
-#define elf_xword_to_cpu(x) elf32_to_cpu(x)
+# define elf_addr_to_cpu(x) elf32_to_cpu(x)
+# define elf_off_to_cpu(x) elf32_to_cpu(x)
+# define elf_xword_to_cpu(x) elf32_to_cpu(x)
#endif
static int sym_index(Elf_Sym *sym)
@@ -365,22 +370,17 @@ static int sym_index(Elf_Sym *sym)
static void read_ehdr(FILE *fp)
{
- if (fread(&ehdr, sizeof(ehdr), 1, fp) != 1) {
- die("Cannot read ELF header: %s\n",
- strerror(errno));
- }
- if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0) {
+ if (fread(&ehdr, sizeof(ehdr), 1, fp) != 1)
+ die("Cannot read ELF header: %s\n", strerror(errno));
+ if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0)
die("No ELF magic\n");
- }
- if (ehdr.e_ident[EI_CLASS] != ELF_CLASS) {
+ if (ehdr.e_ident[EI_CLASS] != ELF_CLASS)
die("Not a %d bit executable\n", ELF_BITS);
- }
- if (ehdr.e_ident[EI_DATA] != ELFDATA2LSB) {
+ if (ehdr.e_ident[EI_DATA] != ELFDATA2LSB)
die("Not a LSB ELF executable\n");
- }
- if (ehdr.e_ident[EI_VERSION] != EV_CURRENT) {
+ if (ehdr.e_ident[EI_VERSION] != EV_CURRENT)
die("Unknown ELF version\n");
- }
+
/* Convert the fields to native endian */
ehdr.e_type = elf_half_to_cpu(ehdr.e_type);
ehdr.e_machine = elf_half_to_cpu(ehdr.e_machine);
@@ -439,19 +439,18 @@ static void read_shdrs(FILE *fp)
Elf_Shdr shdr;
secs = calloc(shnum, sizeof(struct section));
- if (!secs) {
- die("Unable to allocate %ld section headers\n",
- shnum);
- }
- if (fseek(fp, ehdr.e_shoff, SEEK_SET) < 0) {
- die("Seek to %" FMT " failed: %s\n",
- ehdr.e_shoff, strerror(errno));
- }
+ if (!secs)
+ die("Unable to allocate %ld section headers\n", shnum);
+
+ if (fseek(fp, ehdr.e_shoff, SEEK_SET) < 0)
+ die("Seek to %" FMT " failed: %s\n", ehdr.e_shoff, strerror(errno));
+
for (i = 0; i < shnum; i++) {
struct section *sec = &secs[i];
+
if (fread(&shdr, sizeof(shdr), 1, fp) != 1)
- die("Cannot read ELF section headers %d/%ld: %s\n",
- i, shnum, strerror(errno));
+ die("Cannot read ELF section headers %d/%ld: %s\n", i, shnum, strerror(errno));
+
sec->shdr.sh_name = elf_word_to_cpu(shdr.sh_name);
sec->shdr.sh_type = elf_word_to_cpu(shdr.sh_type);
sec->shdr.sh_flags = elf_xword_to_cpu(shdr.sh_flags);
@@ -471,31 +470,28 @@ static void read_shdrs(FILE *fp)
static void read_strtabs(FILE *fp)
{
int i;
+
for (i = 0; i < shnum; i++) {
struct section *sec = &secs[i];
- if (sec->shdr.sh_type != SHT_STRTAB) {
+
+ if (sec->shdr.sh_type != SHT_STRTAB)
continue;
- }
+
sec->strtab = malloc(sec->shdr.sh_size);
- if (!sec->strtab) {
- die("malloc of %" FMT " bytes for strtab failed\n",
- sec->shdr.sh_size);
- }
- if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) {
- die("Seek to %" FMT " failed: %s\n",
- sec->shdr.sh_offset, strerror(errno));
- }
- if (fread(sec->strtab, 1, sec->shdr.sh_size, fp)
- != sec->shdr.sh_size) {
- die("Cannot read symbol table: %s\n",
- strerror(errno));
- }
+ if (!sec->strtab)
+ die("malloc of %" FMT " bytes for strtab failed\n", sec->shdr.sh_size);
+
+ if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0)
+ die("Seek to %" FMT " failed: %s\n", sec->shdr.sh_offset, strerror(errno));
+
+ if (fread(sec->strtab, 1, sec->shdr.sh_size, fp) != sec->shdr.sh_size)
+ die("Cannot read symbol table: %s\n", strerror(errno));
}
}
static void read_symtabs(FILE *fp)
{
- int i,j;
+ int i, j;
for (i = 0; i < shnum; i++) {
struct section *sec = &secs[i];
@@ -504,19 +500,15 @@ static void read_symtabs(FILE *fp)
switch (sec->shdr.sh_type) {
case SHT_SYMTAB_SHNDX:
sec->xsymtab = malloc(sec->shdr.sh_size);
- if (!sec->xsymtab) {
- die("malloc of %" FMT " bytes for xsymtab failed\n",
- sec->shdr.sh_size);
- }
- if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) {
- die("Seek to %" FMT " failed: %s\n",
- sec->shdr.sh_offset, strerror(errno));
- }
- if (fread(sec->xsymtab, 1, sec->shdr.sh_size, fp)
- != sec->shdr.sh_size) {
- die("Cannot read extended symbol table: %s\n",
- strerror(errno));
- }
+ if (!sec->xsymtab)
+ die("malloc of %" FMT " bytes for xsymtab failed\n", sec->shdr.sh_size);
+
+ if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0)
+ die("Seek to %" FMT " failed: %s\n", sec->shdr.sh_offset, strerror(errno));
+
+ if (fread(sec->xsymtab, 1, sec->shdr.sh_size, fp) != sec->shdr.sh_size)
+ die("Cannot read extended symbol table: %s\n", strerror(errno));
+
shxsymtabndx = i;
continue;
@@ -524,19 +516,15 @@ static void read_symtabs(FILE *fp)
num_syms = sec->shdr.sh_size / sizeof(Elf_Sym);
sec->symtab = malloc(sec->shdr.sh_size);
- if (!sec->symtab) {
- die("malloc of %" FMT " bytes for symtab failed\n",
- sec->shdr.sh_size);
- }
- if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) {
- die("Seek to %" FMT " failed: %s\n",
- sec->shdr.sh_offset, strerror(errno));
- }
- if (fread(sec->symtab, 1, sec->shdr.sh_size, fp)
- != sec->shdr.sh_size) {
- die("Cannot read symbol table: %s\n",
- strerror(errno));
- }
+ if (!sec->symtab)
+ die("malloc of %" FMT " bytes for symtab failed\n", sec->shdr.sh_size);
+
+ if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0)
+ die("Seek to %" FMT " failed: %s\n", sec->shdr.sh_offset, strerror(errno));
+
+ if (fread(sec->symtab, 1, sec->shdr.sh_size, fp) != sec->shdr.sh_size)
+ die("Cannot read symbol table: %s\n", strerror(errno));
+
for (j = 0; j < num_syms; j++) {
Elf_Sym *sym = &sec->symtab[j];
@@ -557,28 +545,27 @@ static void read_symtabs(FILE *fp)
static void read_relocs(FILE *fp)
{
- int i,j;
+ int i, j;
+
for (i = 0; i < shnum; i++) {
struct section *sec = &secs[i];
- if (sec->shdr.sh_type != SHT_REL_TYPE) {
+
+ if (sec->shdr.sh_type != SHT_REL_TYPE)
continue;
- }
+
sec->reltab = malloc(sec->shdr.sh_size);
- if (!sec->reltab) {
- die("malloc of %" FMT " bytes for relocs failed\n",
- sec->shdr.sh_size);
- }
- if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) {
- die("Seek to %" FMT " failed: %s\n",
- sec->shdr.sh_offset, strerror(errno));
- }
- if (fread(sec->reltab, 1, sec->shdr.sh_size, fp)
- != sec->shdr.sh_size) {
- die("Cannot read symbol table: %s\n",
- strerror(errno));
- }
+ if (!sec->reltab)
+ die("malloc of %" FMT " bytes for relocs failed\n", sec->shdr.sh_size);
+
+ if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0)
+ die("Seek to %" FMT " failed: %s\n", sec->shdr.sh_offset, strerror(errno));
+
+ if (fread(sec->reltab, 1, sec->shdr.sh_size, fp) != sec->shdr.sh_size)
+ die("Cannot read symbol table: %s\n", strerror(errno));
+
for (j = 0; j < sec->shdr.sh_size/sizeof(Elf_Rel); j++) {
Elf_Rel *rel = &sec->reltab[j];
+
rel->r_offset = elf_addr_to_cpu(rel->r_offset);
rel->r_info = elf_xword_to_cpu(rel->r_info);
#if (SHT_REL_TYPE == SHT_RELA)
@@ -601,23 +588,27 @@ static void print_absolute_symbols(void)
printf("Absolute symbols\n");
printf(" Num: Value Size Type Bind Visibility Name\n");
+
for (i = 0; i < shnum; i++) {
struct section *sec = &secs[i];
char *sym_strtab;
int j;
- if (sec->shdr.sh_type != SHT_SYMTAB) {
+ if (sec->shdr.sh_type != SHT_SYMTAB)
continue;
- }
+
sym_strtab = sec->link->strtab;
+
for (j = 0; j < sec->shdr.sh_size/sizeof(Elf_Sym); j++) {
Elf_Sym *sym;
const char *name;
+
sym = &sec->symtab[j];
name = sym_name(sym_strtab, sym);
- if (sym->st_shndx != SHN_ABS) {
+
+ if (sym->st_shndx != SHN_ABS)
continue;
- }
+
printf(format,
j, sym->st_value, sym->st_size,
sym_type(ELF_ST_TYPE(sym->st_info)),
@@ -645,34 +636,37 @@ static void print_absolute_relocs(void)
char *sym_strtab;
Elf_Sym *sh_symtab;
int j;
- if (sec->shdr.sh_type != SHT_REL_TYPE) {
+
+ if (sec->shdr.sh_type != SHT_REL_TYPE)
continue;
- }
+
sec_symtab = sec->link;
sec_applies = &secs[sec->shdr.sh_info];
- if (!(sec_applies->shdr.sh_flags & SHF_ALLOC)) {
+ if (!(sec_applies->shdr.sh_flags & SHF_ALLOC))
continue;
- }
+
/*
* Do not perform relocations in .notes section; any
* values there are meant for pre-boot consumption (e.g.
* startup_xen).
*/
- if (sec_applies->shdr.sh_type == SHT_NOTE) {
+ if (sec_applies->shdr.sh_type == SHT_NOTE)
continue;
- }
+
sh_symtab = sec_symtab->symtab;
sym_strtab = sec_symtab->link->strtab;
+
for (j = 0; j < sec->shdr.sh_size/sizeof(Elf_Rel); j++) {
Elf_Rel *rel;
Elf_Sym *sym;
const char *name;
+
rel = &sec->reltab[j];
sym = &sh_symtab[ELF_R_SYM(rel->r_info)];
name = sym_name(sym_strtab, sym);
- if (sym->st_shndx != SHN_ABS) {
+
+ if (sym->st_shndx != SHN_ABS)
continue;
- }
/* Absolute symbols are not relocated if bzImage is
* loaded at a non-compiled address. Display a warning
@@ -691,10 +685,8 @@ static void print_absolute_relocs(void)
continue;
if (!printed) {
- printf("WARNING: Absolute relocations"
- " present\n");
- printf("Offset Info Type Sym.Value "
- "Sym.Name\n");
+ printf("WARNING: Absolute relocations present\n");
+ printf("Offset Info Type Sym.Value Sym.Name\n");
printed = 1;
}
@@ -718,8 +710,8 @@ static void add_reloc(struct relocs *r, uint32_t offset)
void *mem = realloc(r->offset, newsize * sizeof(r->offset[0]));
if (!mem)
- die("realloc of %ld entries for relocs failed\n",
- newsize);
+ die("realloc of %ld entries for relocs failed\n", newsize);
+
r->offset = mem;
r->size = newsize;
}
@@ -730,6 +722,7 @@ static void walk_relocs(int (*process)(struct section *sec, Elf_Rel *rel,
Elf_Sym *sym, const char *symname))
{
int i;
+
/* Walk through the relocations */
for (i = 0; i < shnum; i++) {
char *sym_strtab;
@@ -738,16 +731,25 @@ static void walk_relocs(int (*process)(struct section *sec, Elf_Rel *rel,
int j;
struct section *sec = &secs[i];
- if (sec->shdr.sh_type != SHT_REL_TYPE) {
+ if (sec->shdr.sh_type != SHT_REL_TYPE)
continue;
- }
+
sec_symtab = sec->link;
sec_applies = &secs[sec->shdr.sh_info];
- if (!(sec_applies->shdr.sh_flags & SHF_ALLOC)) {
+ if (!(sec_applies->shdr.sh_flags & SHF_ALLOC))
continue;
- }
+
+ /*
+ * Do not perform relocations in .notes sections; any
+ * values there are meant for pre-boot consumption (e.g.
+ * startup_xen).
+ */
+ if (sec_applies->shdr.sh_type == SHT_NOTE)
+ continue;
+
sh_symtab = sec_symtab->symtab;
sym_strtab = sec_symtab->link->strtab;
+
for (j = 0; j < sec->shdr.sh_size/sizeof(Elf_Rel); j++) {
Elf_Rel *rel = &sec->reltab[j];
Elf_Sym *sym = &sh_symtab[ELF_R_SYM(rel->r_info)];
@@ -781,14 +783,16 @@ static void walk_relocs(int (*process)(struct section *sec, Elf_Rel *rel,
* kernel data and does not require special treatment.
*
*/
-static int per_cpu_shndx = -1;
+static int per_cpu_shndx = -1;
static Elf_Addr per_cpu_load_addr;
static void percpu_init(void)
{
int i;
+
for (i = 0; i < shnum; i++) {
ElfW(Sym) *sym;
+
if (strcmp(sec_name(i), ".data..percpu"))
continue;
@@ -801,6 +805,7 @@ static void percpu_init(void)
per_cpu_shndx = i;
per_cpu_load_addr = sym->st_value;
+
return;
}
}
@@ -871,8 +876,7 @@ static int do_reloc64(struct section *sec, Elf_Rel *rel, ElfW(Sym) *sym,
* Only used by jump labels
*/
if (is_percpu_sym(sym, symname))
- die("Invalid R_X86_64_PC64 relocation against per-CPU symbol %s\n",
- symname);
+ die("Invalid R_X86_64_PC64 relocation against per-CPU symbol %s\n", symname);
break;
case R_X86_64_32:
@@ -892,8 +896,7 @@ static int do_reloc64(struct section *sec, Elf_Rel *rel, ElfW(Sym) *sym,
if (is_reloc(S_ABS, symname))
break;
- die("Invalid absolute %s relocation: %s\n",
- rel_type(r_type), symname);
+ die("Invalid absolute %s relocation: %s\n", rel_type(r_type), symname);
break;
}
@@ -913,8 +916,7 @@ static int do_reloc64(struct section *sec, Elf_Rel *rel, ElfW(Sym) *sym,
break;
default:
- die("Unsupported relocation type: %s (%d)\n",
- rel_type(r_type), r_type);
+ die("Unsupported relocation type: %s (%d)\n", rel_type(r_type), r_type);
break;
}
@@ -951,8 +953,7 @@ static int do_reloc32(struct section *sec, Elf_Rel *rel, Elf_Sym *sym,
if (is_reloc(S_ABS, symname))
break;
- die("Invalid absolute %s relocation: %s\n",
- rel_type(r_type), symname);
+ die("Invalid absolute %s relocation: %s\n", rel_type(r_type), symname);
break;
}
@@ -960,16 +961,14 @@ static int do_reloc32(struct section *sec, Elf_Rel *rel, Elf_Sym *sym,
break;
default:
- die("Unsupported relocation type: %s (%d)\n",
- rel_type(r_type), r_type);
+ die("Unsupported relocation type: %s (%d)\n", rel_type(r_type), r_type);
break;
}
return 0;
}
-static int do_reloc_real(struct section *sec, Elf_Rel *rel, Elf_Sym *sym,
- const char *symname)
+static int do_reloc_real(struct section *sec, Elf_Rel *rel, Elf_Sym *sym, const char *symname)
{
unsigned r_type = ELF32_R_TYPE(rel->r_info);
int shn_abs = (sym->st_shndx == SHN_ABS) && !is_reloc(S_REL, symname);
@@ -1004,9 +1003,7 @@ static int do_reloc_real(struct section *sec, Elf_Rel *rel, Elf_Sym *sym,
if (!is_reloc(S_LIN, symname))
break;
}
- die("Invalid %s %s relocation: %s\n",
- shn_abs ? "absolute" : "relative",
- rel_type(r_type), symname);
+ die("Invalid %s %s relocation: %s\n", shn_abs ? "absolute" : "relative", rel_type(r_type), symname);
break;
case R_386_32:
@@ -1027,14 +1024,11 @@ static int do_reloc_real(struct section *sec, Elf_Rel *rel, Elf_Sym *sym,
add_reloc(&relocs32, rel->r_offset);
break;
}
- die("Invalid %s %s relocation: %s\n",
- shn_abs ? "absolute" : "relative",
- rel_type(r_type), symname);
+ die("Invalid %s %s relocation: %s\n", shn_abs ? "absolute" : "relative", rel_type(r_type), symname);
break;
default:
- die("Unsupported relocation type: %s (%d)\n",
- rel_type(r_type), r_type);
+ die("Unsupported relocation type: %s (%d)\n", rel_type(r_type), r_type);
break;
}
@@ -1046,7 +1040,10 @@ static int do_reloc_real(struct section *sec, Elf_Rel *rel, Elf_Sym *sym,
static int cmp_relocs(const void *va, const void *vb)
{
const uint32_t *a, *b;
- a = va; b = vb;
+
+ a = va;
+ b = vb;
+
return (*a == *b)? 0 : (*a > *b)? 1 : -1;
}
@@ -1060,6 +1057,7 @@ static int write32(uint32_t v, FILE *f)
unsigned char buf[4];
put_unaligned_le32(v, buf);
+
return fwrite(buf, 1, 4, f) == 4 ? 0 : -1;
}
@@ -1072,8 +1070,7 @@ static void emit_relocs(int as_text, int use_real_mode)
{
int i;
int (*write_reloc)(uint32_t, FILE *) = write32;
- int (*do_reloc)(struct section *sec, Elf_Rel *rel, Elf_Sym *sym,
- const char *symname);
+ int (*do_reloc)(struct section *sec, Elf_Rel *rel, Elf_Sym *sym, const char *symname);
#if ELF_BITS == 64
if (!use_real_mode)
@@ -1160,6 +1157,7 @@ static int do_reloc_info(struct section *sec, Elf_Rel *rel, ElfW(Sym) *sym,
rel_type(ELF_R_TYPE(rel->r_info)),
symname,
sec_name(sym_index(sym)));
+
return 0;
}
@@ -1185,19 +1183,24 @@ void process(FILE *fp, int use_real_mode, int as_text,
read_strtabs(fp);
read_symtabs(fp);
read_relocs(fp);
+
if (ELF_BITS == 64)
percpu_init();
+
if (show_absolute_syms) {
print_absolute_symbols();
return;
}
+
if (show_absolute_relocs) {
print_absolute_relocs();
return;
}
+
if (show_reloc_info) {
print_reloc_info();
return;
}
+
emit_relocs(as_text, use_real_mode);
}
diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile
index 8bc72a51b257..36e67fc97c22 100644
--- a/arch/x86/um/Makefile
+++ b/arch/x86/um/Makefile
@@ -9,9 +9,9 @@ else
BITS := 64
endif
-obj-y = bugs_$(BITS).o delay.o fault.o ldt.o \
+obj-y = bugs_$(BITS).o delay.o fault.o \
ptrace_$(BITS).o ptrace_user.o setjmp_$(BITS).o signal.o \
- stub_$(BITS).o stub_segv.o \
+ stub_segv.o \
sys_call_table_$(BITS).o sysrq_$(BITS).o tls_$(BITS).o \
mem_$(BITS).o subarch.o os-Linux/
@@ -31,7 +31,6 @@ obj-y += syscalls_64.o vdso/
subarch-y = ../lib/csum-partial_64.o ../lib/memcpy_64.o \
../lib/memmove_64.o ../lib/memset_64.o
-subarch-$(CONFIG_PREEMPTION) += ../entry/thunk_64.o
endif
diff --git a/arch/x86/um/asm/mm_context.h b/arch/x86/um/asm/mm_context.h
deleted file mode 100644
index dc32dc023c2f..000000000000
--- a/arch/x86/um/asm/mm_context.h
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Copyright (C) 2004 Fujitsu Siemens Computers GmbH
- * Licensed under the GPL
- *
- * Author: Bodo Stroesser <bstroesser@fujitsu-siemens.com>
- */
-
-#ifndef __ASM_LDT_H
-#define __ASM_LDT_H
-
-#include <linux/mutex.h>
-#include <asm/ldt.h>
-
-#define LDT_PAGES_MAX \
- ((LDT_ENTRIES * LDT_ENTRY_SIZE)/PAGE_SIZE)
-#define LDT_ENTRIES_PER_PAGE \
- (PAGE_SIZE/LDT_ENTRY_SIZE)
-#define LDT_DIRECT_ENTRIES \
- ((LDT_PAGES_MAX*sizeof(void *))/LDT_ENTRY_SIZE)
-
-struct ldt_entry {
- __u32 a;
- __u32 b;
-};
-
-typedef struct uml_ldt {
- int entry_count;
- struct mutex lock;
- union {
- struct ldt_entry * pages[LDT_PAGES_MAX];
- struct ldt_entry entries[LDT_DIRECT_ENTRIES];
- } u;
-} uml_ldt_t;
-
-#define LDT_entry_a(info) \
- ((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff))
-
-#define LDT_entry_b(info) \
- (((info)->base_addr & 0xff000000) | \
- (((info)->base_addr & 0x00ff0000) >> 16) | \
- ((info)->limit & 0xf0000) | \
- (((info)->read_exec_only ^ 1) << 9) | \
- ((info)->contents << 10) | \
- (((info)->seg_not_present ^ 1) << 15) | \
- ((info)->seg_32bit << 22) | \
- ((info)->limit_in_pages << 23) | \
- ((info)->useable << 20) | \
- 0x7000)
-
-#define _LDT_empty(info) (\
- (info)->base_addr == 0 && \
- (info)->limit == 0 && \
- (info)->contents == 0 && \
- (info)->read_exec_only == 1 && \
- (info)->seg_32bit == 0 && \
- (info)->limit_in_pages == 0 && \
- (info)->seg_not_present == 1 && \
- (info)->useable == 0 )
-
-#ifdef CONFIG_X86_64
-#define LDT_empty(info) (_LDT_empty(info) && ((info)->lm == 0))
-#else
-#define LDT_empty(info) (_LDT_empty(info))
-#endif
-
-struct uml_arch_mm_context {
- uml_ldt_t ldt;
-};
-
-#endif
diff --git a/arch/x86/um/asm/ptrace.h b/arch/x86/um/asm/ptrace.h
index 83822fd42204..2fef3da55533 100644
--- a/arch/x86/um/asm/ptrace.h
+++ b/arch/x86/um/asm/ptrace.h
@@ -54,6 +54,8 @@ extern int ptrace_get_thread_area(struct task_struct *child, int idx,
extern int ptrace_set_thread_area(struct task_struct *child, int idx,
struct user_desc __user *user_desc);
+extern int arch_switch_tls(struct task_struct *to);
+
#else
#define PT_REGS_R8(r) UPT_R8(&(r)->regs)
@@ -83,5 +85,9 @@ extern long arch_prctl(struct task_struct *task, int option,
unsigned long __user *addr);
#endif
+
#define user_stack_pointer(regs) PT_REGS_SP(regs)
+
+extern void arch_switch_to(struct task_struct *to);
+
#endif /* __UM_X86_PTRACE_H */
diff --git a/arch/x86/um/bugs_32.c b/arch/x86/um/bugs_32.c
index 33daff4dade4..d29929efcc07 100644
--- a/arch/x86/um/bugs_32.c
+++ b/arch/x86/um/bugs_32.c
@@ -3,6 +3,7 @@
* Licensed under the GPL
*/
+#include <arch.h>
#include <signal.h>
#include <kern_util.h>
#include <longjmp.h>
diff --git a/arch/x86/um/bugs_64.c b/arch/x86/um/bugs_64.c
index 8cc8256c698d..b01295e8a676 100644
--- a/arch/x86/um/bugs_64.c
+++ b/arch/x86/um/bugs_64.c
@@ -4,6 +4,7 @@
* Licensed under the GPL
*/
+#include <arch.h>
#include <sysdep/ptrace.h>
void arch_check_bugs(void)
diff --git a/arch/x86/um/elfcore.c b/arch/x86/um/elfcore.c
index 650cdbbdaf45..ef50662fc40d 100644
--- a/arch/x86/um/elfcore.c
+++ b/arch/x86/um/elfcore.c
@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/elf.h>
+#include <linux/elfcore.h>
#include <linux/coredump.h>
#include <linux/fs.h>
#include <linux/mm.h>
diff --git a/arch/x86/um/fault.c b/arch/x86/um/fault.c
index 84ac7f7b0257..0dde4d613a87 100644
--- a/arch/x86/um/fault.c
+++ b/arch/x86/um/fault.c
@@ -3,6 +3,7 @@
* Licensed under the GPL
*/
+#include <arch.h>
#include <sysdep/ptrace.h>
/* These two are from asm-um/uaccess.h and linux/module.h, check them. */
diff --git a/arch/x86/um/ldt.c b/arch/x86/um/ldt.c
deleted file mode 100644
index 255a44dd415a..000000000000
--- a/arch/x86/um/ldt.c
+++ /dev/null
@@ -1,380 +0,0 @@
-/*
- * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- * Licensed under the GPL
- */
-
-#include <linux/mm.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/syscalls.h>
-#include <linux/uaccess.h>
-#include <asm/unistd.h>
-#include <os.h>
-#include <skas.h>
-#include <sysdep/tls.h>
-
-static inline int modify_ldt (int func, void *ptr, unsigned long bytecount)
-{
- return syscall(__NR_modify_ldt, func, ptr, bytecount);
-}
-
-static long write_ldt_entry(struct mm_id *mm_idp, int func,
- struct user_desc *desc, void **addr, int done)
-{
- long res;
- void *stub_addr;
-
- BUILD_BUG_ON(sizeof(*desc) % sizeof(long));
-
- res = syscall_stub_data(mm_idp, (unsigned long *)desc,
- sizeof(*desc) / sizeof(long),
- addr, &stub_addr);
- if (!res) {
- unsigned long args[] = { func,
- (unsigned long)stub_addr,
- sizeof(*desc),
- 0, 0, 0 };
- res = run_syscall_stub(mm_idp, __NR_modify_ldt, args,
- 0, addr, done);
- }
-
- return res;
-}
-
-/*
- * In skas mode, we hold our own ldt data in UML.
- * Thus, the code implementing sys_modify_ldt_skas
- * is very similar to (and mostly stolen from) sys_modify_ldt
- * for arch/i386/kernel/ldt.c
- * The routines copied and modified in part are:
- * - read_ldt
- * - read_default_ldt
- * - write_ldt
- * - sys_modify_ldt_skas
- */
-
-static int read_ldt(void __user * ptr, unsigned long bytecount)
-{
- int i, err = 0;
- unsigned long size;
- uml_ldt_t *ldt = &current->mm->context.arch.ldt;
-
- if (!ldt->entry_count)
- goto out;
- if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES)
- bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES;
- err = bytecount;
-
- mutex_lock(&ldt->lock);
- if (ldt->entry_count <= LDT_DIRECT_ENTRIES) {
- size = LDT_ENTRY_SIZE*LDT_DIRECT_ENTRIES;
- if (size > bytecount)
- size = bytecount;
- if (copy_to_user(ptr, ldt->u.entries, size))
- err = -EFAULT;
- bytecount -= size;
- ptr += size;
- }
- else {
- for (i=0; i<ldt->entry_count/LDT_ENTRIES_PER_PAGE && bytecount;
- i++) {
- size = PAGE_SIZE;
- if (size > bytecount)
- size = bytecount;
- if (copy_to_user(ptr, ldt->u.pages[i], size)) {
- err = -EFAULT;
- break;
- }
- bytecount -= size;
- ptr += size;
- }
- }
- mutex_unlock(&ldt->lock);
-
- if (bytecount == 0 || err == -EFAULT)
- goto out;
-
- if (clear_user(ptr, bytecount))
- err = -EFAULT;
-
-out:
- return err;
-}
-
-static int read_default_ldt(void __user * ptr, unsigned long bytecount)
-{
- int err;
-
- if (bytecount > 5*LDT_ENTRY_SIZE)
- bytecount = 5*LDT_ENTRY_SIZE;
-
- err = bytecount;
- /*
- * UML doesn't support lcall7 and lcall27.
- * So, we don't really have a default ldt, but emulate
- * an empty ldt of common host default ldt size.
- */
- if (clear_user(ptr, bytecount))
- err = -EFAULT;
-
- return err;
-}
-
-static int write_ldt(void __user * ptr, unsigned long bytecount, int func)
-{
- uml_ldt_t *ldt = &current->mm->context.arch.ldt;
- struct mm_id * mm_idp = &current->mm->context.id;
- int i, err;
- struct user_desc ldt_info;
- struct ldt_entry entry0, *ldt_p;
- void *addr = NULL;
-
- err = -EINVAL;
- if (bytecount != sizeof(ldt_info))
- goto out;
- err = -EFAULT;
- if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info)))
- goto out;
-
- err = -EINVAL;
- if (ldt_info.entry_number >= LDT_ENTRIES)
- goto out;
- if (ldt_info.contents == 3) {
- if (func == 1)
- goto out;
- if (ldt_info.seg_not_present == 0)
- goto out;
- }
-
- mutex_lock(&ldt->lock);
-
- err = write_ldt_entry(mm_idp, func, &ldt_info, &addr, 1);
- if (err)
- goto out_unlock;
-
- if (ldt_info.entry_number >= ldt->entry_count &&
- ldt_info.entry_number >= LDT_DIRECT_ENTRIES) {
- for (i=ldt->entry_count/LDT_ENTRIES_PER_PAGE;
- i*LDT_ENTRIES_PER_PAGE <= ldt_info.entry_number;
- i++) {
- if (i == 0)
- memcpy(&entry0, ldt->u.entries,
- sizeof(entry0));
- ldt->u.pages[i] = (struct ldt_entry *)
- __get_free_page(GFP_KERNEL|__GFP_ZERO);
- if (!ldt->u.pages[i]) {
- err = -ENOMEM;
- /* Undo the change in host */
- memset(&ldt_info, 0, sizeof(ldt_info));
- write_ldt_entry(mm_idp, 1, &ldt_info, &addr, 1);
- goto out_unlock;
- }
- if (i == 0) {
- memcpy(ldt->u.pages[0], &entry0,
- sizeof(entry0));
- memcpy(ldt->u.pages[0]+1, ldt->u.entries+1,
- sizeof(entry0)*(LDT_DIRECT_ENTRIES-1));
- }
- ldt->entry_count = (i + 1) * LDT_ENTRIES_PER_PAGE;
- }
- }
- if (ldt->entry_count <= ldt_info.entry_number)
- ldt->entry_count = ldt_info.entry_number + 1;
-
- if (ldt->entry_count <= LDT_DIRECT_ENTRIES)
- ldt_p = ldt->u.entries + ldt_info.entry_number;
- else
- ldt_p = ldt->u.pages[ldt_info.entry_number/LDT_ENTRIES_PER_PAGE] +
- ldt_info.entry_number%LDT_ENTRIES_PER_PAGE;
-
- if (ldt_info.base_addr == 0 && ldt_info.limit == 0 &&
- (func == 1 || LDT_empty(&ldt_info))) {
- ldt_p->a = 0;
- ldt_p->b = 0;
- }
- else{
- if (func == 1)
- ldt_info.useable = 0;
- ldt_p->a = LDT_entry_a(&ldt_info);
- ldt_p->b = LDT_entry_b(&ldt_info);
- }
- err = 0;
-
-out_unlock:
- mutex_unlock(&ldt->lock);
-out:
- return err;
-}
-
-static long do_modify_ldt_skas(int func, void __user *ptr,
- unsigned long bytecount)
-{
- int ret = -ENOSYS;
-
- switch (func) {
- case 0:
- ret = read_ldt(ptr, bytecount);
- break;
- case 1:
- case 0x11:
- ret = write_ldt(ptr, bytecount, func);
- break;
- case 2:
- ret = read_default_ldt(ptr, bytecount);
- break;
- }
- return ret;
-}
-
-static DEFINE_SPINLOCK(host_ldt_lock);
-static short dummy_list[9] = {0, -1};
-static short * host_ldt_entries = NULL;
-
-static void ldt_get_host_info(void)
-{
- long ret;
- struct ldt_entry * ldt;
- short *tmp;
- int i, size, k, order;
-
- spin_lock(&host_ldt_lock);
-
- if (host_ldt_entries != NULL) {
- spin_unlock(&host_ldt_lock);
- return;
- }
- host_ldt_entries = dummy_list+1;
-
- spin_unlock(&host_ldt_lock);
-
- for (i = LDT_PAGES_MAX-1, order=0; i; i>>=1, order++)
- ;
-
- ldt = (struct ldt_entry *)
- __get_free_pages(GFP_KERNEL|__GFP_ZERO, order);
- if (ldt == NULL) {
- printk(KERN_ERR "ldt_get_host_info: couldn't allocate buffer "
- "for host ldt\n");
- return;
- }
-
- ret = modify_ldt(0, ldt, (1<<order)*PAGE_SIZE);
- if (ret < 0) {
- printk(KERN_ERR "ldt_get_host_info: couldn't read host ldt\n");
- goto out_free;
- }
- if (ret == 0) {
- /* default_ldt is active, simply write an empty entry 0 */
- host_ldt_entries = dummy_list;
- goto out_free;
- }
-
- for (i=0, size=0; i<ret/LDT_ENTRY_SIZE; i++) {
- if (ldt[i].a != 0 || ldt[i].b != 0)
- size++;
- }
-
- if (size < ARRAY_SIZE(dummy_list))
- host_ldt_entries = dummy_list;
- else {
- size = (size + 1) * sizeof(dummy_list[0]);
- tmp = kmalloc(size, GFP_KERNEL);
- if (tmp == NULL) {
- printk(KERN_ERR "ldt_get_host_info: couldn't allocate "
- "host ldt list\n");
- goto out_free;
- }
- host_ldt_entries = tmp;
- }
-
- for (i=0, k=0; i<ret/LDT_ENTRY_SIZE; i++) {
- if (ldt[i].a != 0 || ldt[i].b != 0)
- host_ldt_entries[k++] = i;
- }
- host_ldt_entries[k] = -1;
-
-out_free:
- free_pages((unsigned long)ldt, order);
-}
-
-long init_new_ldt(struct mm_context *new_mm, struct mm_context *from_mm)
-{
- struct user_desc desc;
- short * num_p;
- int i;
- long page, err=0;
- void *addr = NULL;
-
-
- mutex_init(&new_mm->arch.ldt.lock);
-
- if (!from_mm) {
- memset(&desc, 0, sizeof(desc));
- /*
- * Now we try to retrieve info about the ldt, we
- * inherited from the host. All ldt-entries found
- * will be reset in the following loop
- */
- ldt_get_host_info();
- for (num_p=host_ldt_entries; *num_p != -1; num_p++) {
- desc.entry_number = *num_p;
- err = write_ldt_entry(&new_mm->id, 1, &desc,
- &addr, *(num_p + 1) == -1);
- if (err)
- break;
- }
- new_mm->arch.ldt.entry_count = 0;
-
- goto out;
- }
-
- /*
- * Our local LDT is used to supply the data for
- * modify_ldt(READLDT), if PTRACE_LDT isn't available,
- * i.e., we have to use the stub for modify_ldt, which
- * can't handle the big read buffer of up to 64kB.
- */
- mutex_lock(&from_mm->arch.ldt.lock);
- if (from_mm->arch.ldt.entry_count <= LDT_DIRECT_ENTRIES)
- memcpy(new_mm->arch.ldt.u.entries, from_mm->arch.ldt.u.entries,
- sizeof(new_mm->arch.ldt.u.entries));
- else {
- i = from_mm->arch.ldt.entry_count / LDT_ENTRIES_PER_PAGE;
- while (i-->0) {
- page = __get_free_page(GFP_KERNEL|__GFP_ZERO);
- if (!page) {
- err = -ENOMEM;
- break;
- }
- new_mm->arch.ldt.u.pages[i] =
- (struct ldt_entry *) page;
- memcpy(new_mm->arch.ldt.u.pages[i],
- from_mm->arch.ldt.u.pages[i], PAGE_SIZE);
- }
- }
- new_mm->arch.ldt.entry_count = from_mm->arch.ldt.entry_count;
- mutex_unlock(&from_mm->arch.ldt.lock);
-
- out:
- return err;
-}
-
-
-void free_ldt(struct mm_context *mm)
-{
- int i;
-
- if (mm->arch.ldt.entry_count > LDT_DIRECT_ENTRIES) {
- i = mm->arch.ldt.entry_count / LDT_ENTRIES_PER_PAGE;
- while (i-- > 0)
- free_page((long) mm->arch.ldt.u.pages[i]);
- }
- mm->arch.ldt.entry_count = 0;
-}
-
-SYSCALL_DEFINE3(modify_ldt, int , func , void __user * , ptr ,
- unsigned long , bytecount)
-{
- /* See non-um modify_ldt() for why we do this cast */
- return (unsigned int)do_modify_ldt_skas(func, ptr, bytecount);
-}
diff --git a/arch/x86/um/os-Linux/mcontext.c b/arch/x86/um/os-Linux/mcontext.c
index 49c3744cac37..e80ab7d28117 100644
--- a/arch/x86/um/os-Linux/mcontext.c
+++ b/arch/x86/um/os-Linux/mcontext.c
@@ -3,6 +3,7 @@
#define __FRAME_OFFSETS
#include <asm/ptrace.h>
#include <sysdep/ptrace.h>
+#include <sysdep/mcontext.h>
void get_regs_from_mc(struct uml_pt_regs *regs, mcontext_t *mc)
{
diff --git a/arch/x86/um/os-Linux/registers.c b/arch/x86/um/os-Linux/registers.c
index df8f4b4bf98b..f3638dd09cec 100644
--- a/arch/x86/um/os-Linux/registers.c
+++ b/arch/x86/um/os-Linux/registers.c
@@ -17,7 +17,7 @@
#include <linux/elf.h>
#include <registers.h>
-int have_xstate_support;
+static int have_xstate_support;
int save_i387_registers(int pid, unsigned long *fp_regs)
{
diff --git a/arch/x86/um/os-Linux/tls.c b/arch/x86/um/os-Linux/tls.c
index 3e1b1bf6acbc..eed9efe29ade 100644
--- a/arch/x86/um/os-Linux/tls.c
+++ b/arch/x86/um/os-Linux/tls.c
@@ -6,6 +6,7 @@
#include <sys/syscall.h>
#include <unistd.h>
+#include <os.h>
#include <sysdep/tls.h>
#ifndef PTRACE_GET_THREAD_AREA
diff --git a/arch/x86/um/ptrace_32.c b/arch/x86/um/ptrace_32.c
index 7f1abde2c84b..b0a71c6cdc6e 100644
--- a/arch/x86/um/ptrace_32.c
+++ b/arch/x86/um/ptrace_32.c
@@ -10,8 +10,6 @@
#include <registers.h>
#include <skas.h>
-extern int arch_switch_tls(struct task_struct *to);
-
void arch_switch_to(struct task_struct *to)
{
int err = arch_switch_tls(to);
diff --git a/arch/x86/um/shared/sysdep/archsetjmp.h b/arch/x86/um/shared/sysdep/archsetjmp.h
index 166cedbab926..8c81d1a604a9 100644
--- a/arch/x86/um/shared/sysdep/archsetjmp.h
+++ b/arch/x86/um/shared/sysdep/archsetjmp.h
@@ -1,6 +1,13 @@
/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __X86_UM_SYSDEP_ARCHSETJMP_H
+#define __X86_UM_SYSDEP_ARCHSETJMP_H
+
#ifdef __i386__
#include "archsetjmp_32.h"
#else
#include "archsetjmp_64.h"
#endif
+
+unsigned long get_thread_reg(int reg, jmp_buf *buf);
+
+#endif /* __X86_UM_SYSDEP_ARCHSETJMP_H */
diff --git a/arch/x86/um/shared/sysdep/kernel-offsets.h b/arch/x86/um/shared/sysdep/kernel-offsets.h
index a004bffb7b8d..48de3a71f845 100644
--- a/arch/x86/um/shared/sysdep/kernel-offsets.h
+++ b/arch/x86/um/shared/sysdep/kernel-offsets.h
@@ -6,6 +6,9 @@
#include <linux/kbuild.h>
#include <asm/mman.h>
+/* workaround for a warning with -Wmissing-prototypes */
+void foo(void);
+
void foo(void)
{
#include <common-offsets.h>
diff --git a/arch/x86/um/shared/sysdep/stub.h b/arch/x86/um/shared/sysdep/stub.h
index ce0ca46ad383..dc89f4423454 100644
--- a/arch/x86/um/shared/sysdep/stub.h
+++ b/arch/x86/um/shared/sysdep/stub.h
@@ -12,4 +12,4 @@
#endif
extern void stub_segv_handler(int, siginfo_t *, void *);
-extern void stub_clone_handler(void);
+extern void stub_syscall_handler(void);
diff --git a/arch/x86/um/shared/sysdep/stub_32.h b/arch/x86/um/shared/sysdep/stub_32.h
index ea8b5a2d67af..0b44a86dd346 100644
--- a/arch/x86/um/shared/sysdep/stub_32.h
+++ b/arch/x86/um/shared/sysdep/stub_32.h
@@ -6,6 +6,7 @@
#ifndef __SYSDEP_STUB_H
#define __SYSDEP_STUB_H
+#include <stddef.h>
#include <asm/ptrace.h>
#include <generated/asm-offsets.h>
@@ -79,33 +80,31 @@ static __always_inline long stub_syscall5(long syscall, long arg1, long arg2,
return ret;
}
-static __always_inline void trap_myself(void)
+static __always_inline long stub_syscall6(long syscall, long arg1, long arg2,
+ long arg3, long arg4, long arg5,
+ long arg6)
{
- __asm("int3");
+ struct syscall_args {
+ int ebx, ebp;
+ } args = { arg1, arg6 };
+ long ret;
+
+ __asm__ volatile ("pushl %%ebp;"
+ "movl 0x4(%%ebx),%%ebp;"
+ "movl (%%ebx),%%ebx;"
+ "int $0x80;"
+ "popl %%ebp"
+ : "=a" (ret)
+ : "0" (syscall), "b" (&args),
+ "c" (arg2), "d" (arg3), "S" (arg4), "D" (arg5)
+ : "memory");
+
+ return ret;
}
-static __always_inline void remap_stack_and_trap(void)
+static __always_inline void trap_myself(void)
{
- __asm__ volatile (
- "movl %%esp,%%ebx ;"
- "andl %0,%%ebx ;"
- "movl %1,%%eax ;"
- "movl %%ebx,%%edi ; addl %2,%%edi ; movl (%%edi),%%edi ;"
- "movl %%ebx,%%ebp ; addl %3,%%ebp ; movl (%%ebp),%%ebp ;"
- "int $0x80 ;"
- "addl %4,%%ebx ; movl %%eax, (%%ebx) ;"
- "int $3"
- : :
- "g" (~(STUB_DATA_PAGES * UM_KERN_PAGE_SIZE - 1)),
- "g" (STUB_MMAP_NR),
- "g" (UML_STUB_FIELD_FD),
- "g" (UML_STUB_FIELD_OFFSET),
- "g" (UML_STUB_FIELD_CHILD_ERR),
- "c" (STUB_DATA_PAGES * UM_KERN_PAGE_SIZE),
- "d" (PROT_READ | PROT_WRITE),
- "S" (MAP_FIXED | MAP_SHARED)
- :
- "memory");
+ __asm("int3");
}
static __always_inline void *get_stub_data(void)
diff --git a/arch/x86/um/shared/sysdep/stub_64.h b/arch/x86/um/shared/sysdep/stub_64.h
index b24168ef0ac4..67f44284f1aa 100644
--- a/arch/x86/um/shared/sysdep/stub_64.h
+++ b/arch/x86/um/shared/sysdep/stub_64.h
@@ -6,6 +6,7 @@
#ifndef __SYSDEP_STUB_H
#define __SYSDEP_STUB_H
+#include <stddef.h>
#include <sysdep/ptrace_user.h>
#include <generated/asm-offsets.h>
#include <linux/stddef.h>
@@ -79,35 +80,25 @@ static __always_inline long stub_syscall5(long syscall, long arg1, long arg2,
return ret;
}
-static __always_inline void trap_myself(void)
+static __always_inline long stub_syscall6(long syscall, long arg1, long arg2,
+ long arg3, long arg4, long arg5,
+ long arg6)
{
- __asm("int3");
+ long ret;
+
+ __asm__ volatile ("movq %5,%%r10 ; movq %6,%%r8 ; movq %7,%%r9 ; "
+ __syscall
+ : "=a" (ret)
+ : "0" (syscall), "D" (arg1), "S" (arg2), "d" (arg3),
+ "g" (arg4), "g" (arg5), "g" (arg6)
+ : __syscall_clobber, "r10", "r8", "r9");
+
+ return ret;
}
-static __always_inline void remap_stack_and_trap(void)
+static __always_inline void trap_myself(void)
{
- __asm__ volatile (
- "movq %0,%%rax ;"
- "movq %%rsp,%%rdi ;"
- "andq %1,%%rdi ;"
- "movq %2,%%r10 ;"
- "movq %%rdi,%%r8 ; addq %3,%%r8 ; movq (%%r8),%%r8 ;"
- "movq %%rdi,%%r9 ; addq %4,%%r9 ; movq (%%r9),%%r9 ;"
- __syscall ";"
- "movq %%rsp,%%rdi ; andq %1,%%rdi ;"
- "addq %5,%%rdi ; movq %%rax, (%%rdi) ;"
- "int3"
- : :
- "g" (STUB_MMAP_NR),
- "g" (~(STUB_DATA_PAGES * UM_KERN_PAGE_SIZE - 1)),
- "g" (MAP_FIXED | MAP_SHARED),
- "g" (UML_STUB_FIELD_FD),
- "g" (UML_STUB_FIELD_OFFSET),
- "g" (UML_STUB_FIELD_CHILD_ERR),
- "S" (STUB_DATA_PAGES * UM_KERN_PAGE_SIZE),
- "d" (PROT_READ | PROT_WRITE)
- :
- __syscall_clobber, "r10", "r8", "r9");
+ __asm("int3");
}
static __always_inline void *get_stub_data(void)
diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c
index 263e1d08f216..2cc8c2309022 100644
--- a/arch/x86/um/signal.c
+++ b/arch/x86/um/signal.c
@@ -8,6 +8,7 @@
#include <linux/personality.h>
#include <linux/ptrace.h>
#include <linux/kernel.h>
+#include <linux/syscalls.h>
#include <asm/unistd.h>
#include <linux/uaccess.h>
#include <asm/ucontext.h>
@@ -155,7 +156,7 @@ static int copy_sc_from_user(struct pt_regs *regs,
struct sigcontext __user *from)
{
struct sigcontext sc;
- int err, pid;
+ int err;
/* Always make any pending restarted system calls return -EINTR */
current->restart_block.fn = do_no_restart_syscall;
@@ -201,10 +202,10 @@ static int copy_sc_from_user(struct pt_regs *regs,
#undef GETREG
- pid = userspace_pid[current_thread_info()->cpu];
#ifdef CONFIG_X86_32
if (have_fpx_regs) {
struct user_fxsr_struct fpx;
+ int pid = userspace_pid[current_thread_info()->cpu];
err = copy_from_user(&fpx,
&((struct _fpstate __user *)sc.fpstate)->_fxsr_env[0],
@@ -240,7 +241,7 @@ static int copy_sc_to_user(struct sigcontext __user *to,
{
struct sigcontext sc;
struct faultinfo * fi = &current->thread.arch.faultinfo;
- int err, pid;
+ int err;
memset(&sc, 0, sizeof(struct sigcontext));
#define PUTREG(regno, regname) sc.regname = regs->regs.gp[HOST_##regno]
@@ -288,10 +289,9 @@ static int copy_sc_to_user(struct sigcontext __user *to,
if (err)
return 1;
- pid = userspace_pid[current_thread_info()->cpu];
-
#ifdef CONFIG_X86_32
if (have_fpx_regs) {
+ int pid = userspace_pid[current_thread_info()->cpu];
struct user_fxsr_struct fpx;
err = save_fpx_registers(pid, (unsigned long *) &fpx);
@@ -450,7 +450,7 @@ int setup_signal_stack_si(unsigned long stack_top, struct ksignal *ksig,
return 0;
}
-long sys_sigreturn(void)
+SYSCALL_DEFINE0(sigreturn)
{
unsigned long sp = PT_REGS_SP(&current->thread.regs);
struct sigframe __user *frame = (struct sigframe __user *)(sp - 8);
@@ -557,7 +557,7 @@ int setup_signal_stack_si(unsigned long stack_top, struct ksignal *ksig,
}
#endif
-long sys_rt_sigreturn(void)
+SYSCALL_DEFINE0(rt_sigreturn)
{
unsigned long sp = PT_REGS_SP(&current->thread.regs);
struct rt_sigframe __user *frame =
diff --git a/arch/x86/um/stub_32.S b/arch/x86/um/stub_32.S
deleted file mode 100644
index 8291899e6aaf..000000000000
--- a/arch/x86/um/stub_32.S
+++ /dev/null
@@ -1,56 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#include <as-layout.h>
-
-.section .__syscall_stub, "ax"
-
- .globl batch_syscall_stub
-batch_syscall_stub:
- /* %esp comes in as "top of page" */
- mov %esp, %ecx
- /* %esp has pointer to first operation */
- add $8, %esp
-again:
- /* load length of additional data */
- mov 0x0(%esp), %eax
-
- /* if(length == 0) : end of list */
- /* write possible 0 to header */
- mov %eax, 0x4(%ecx)
- cmpl $0, %eax
- jz done
-
- /* save current pointer */
- mov %esp, 0x4(%ecx)
-
- /* skip additional data */
- add %eax, %esp
-
- /* load syscall-# */
- pop %eax
-
- /* load syscall params */
- pop %ebx
- pop %ecx
- pop %edx
- pop %esi
- pop %edi
- pop %ebp
-
- /* execute syscall */
- int $0x80
-
- /* restore top of page pointer in %ecx */
- mov %esp, %ecx
- andl $(~UM_KERN_PAGE_SIZE) + 1, %ecx
-
- /* check return value */
- pop %ebx
- cmp %ebx, %eax
- je again
-
-done:
- /* save return value */
- mov %eax, (%ecx)
-
- /* stop */
- int3
diff --git a/arch/x86/um/stub_64.S b/arch/x86/um/stub_64.S
deleted file mode 100644
index f3404640197a..000000000000
--- a/arch/x86/um/stub_64.S
+++ /dev/null
@@ -1,50 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#include <as-layout.h>
-
-.section .__syscall_stub, "ax"
- .globl batch_syscall_stub
-batch_syscall_stub:
- /* %rsp has the pointer to first operation */
- mov %rsp, %rbx
- add $0x10, %rsp
-again:
- /* load length of additional data */
- mov 0x0(%rsp), %rax
-
- /* if(length == 0) : end of list */
- /* write possible 0 to header */
- mov %rax, 8(%rbx)
- cmp $0, %rax
- jz done
-
- /* save current pointer */
- mov %rsp, 8(%rbx)
-
- /* skip additional data */
- add %rax, %rsp
-
- /* load syscall-# */
- pop %rax
-
- /* load syscall params */
- pop %rdi
- pop %rsi
- pop %rdx
- pop %r10
- pop %r8
- pop %r9
-
- /* execute syscall */
- syscall
-
- /* check return value */
- pop %rcx
- cmp %rcx, %rax
- je again
-
-done:
- /* save return value */
- mov %rax, (%rbx)
-
- /* stop */
- int3
diff --git a/arch/x86/um/sys_call_table_32.c b/arch/x86/um/sys_call_table_32.c
index 89df5d89d664..51655133eee3 100644
--- a/arch/x86/um/sys_call_table_32.c
+++ b/arch/x86/um/sys_call_table_32.c
@@ -9,6 +9,10 @@
#include <linux/cache.h>
#include <asm/syscall.h>
+extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long,
+ unsigned long, unsigned long,
+ unsigned long, unsigned long);
+
/*
* Below you can see, in terms of #define's, the differences between the x86-64
* and the UML syscall table.
@@ -22,15 +26,13 @@
#define sys_vm86 sys_ni_syscall
#define __SYSCALL_WITH_COMPAT(nr, native, compat) __SYSCALL(nr, native)
+#define __SYSCALL_NORETURN __SYSCALL
#define __SYSCALL(nr, sym) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
#include <asm/syscalls_32.h>
+#undef __SYSCALL
-#undef __SYSCALL
#define __SYSCALL(nr, sym) sym,
-
-extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
-
const sys_call_ptr_t sys_call_table[] ____cacheline_aligned = {
#include <asm/syscalls_32.h>
};
diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c
index b0b4cfd2308c..943d414f2109 100644
--- a/arch/x86/um/sys_call_table_64.c
+++ b/arch/x86/um/sys_call_table_64.c
@@ -9,6 +9,10 @@
#include <linux/cache.h>
#include <asm/syscall.h>
+extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long,
+ unsigned long, unsigned long,
+ unsigned long, unsigned long);
+
/*
* Below you can see, in terms of #define's, the differences between the x86-64
* and the UML syscall table.
@@ -18,14 +22,13 @@
#define sys_iopl sys_ni_syscall
#define sys_ioperm sys_ni_syscall
+#define __SYSCALL_NORETURN __SYSCALL
+
#define __SYSCALL(nr, sym) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
#include <asm/syscalls_64.h>
+#undef __SYSCALL
-#undef __SYSCALL
#define __SYSCALL(nr, sym) sym,
-
-extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
-
const sys_call_ptr_t sys_call_table[] ____cacheline_aligned = {
#include <asm/syscalls_64.h>
};
diff --git a/arch/x86/um/sysrq_32.c b/arch/x86/um/sysrq_32.c
index f2383484840d..a1ee415c008d 100644
--- a/arch/x86/um/sysrq_32.c
+++ b/arch/x86/um/sysrq_32.c
@@ -9,7 +9,6 @@
#include <linux/sched/debug.h>
#include <linux/kallsyms.h>
#include <asm/ptrace.h>
-#include <asm/sysrq.h>
/* This is declared by <linux/sched.h> */
void show_regs(struct pt_regs *regs)
diff --git a/arch/x86/um/sysrq_64.c b/arch/x86/um/sysrq_64.c
index 0bf6de40abff..340d8a243c8a 100644
--- a/arch/x86/um/sysrq_64.c
+++ b/arch/x86/um/sysrq_64.c
@@ -12,7 +12,6 @@
#include <linux/utsname.h>
#include <asm/current.h>
#include <asm/ptrace.h>
-#include <asm/sysrq.h>
void show_regs(struct pt_regs *regs)
{
diff --git a/arch/x86/um/tls_32.c b/arch/x86/um/tls_32.c
index 66162eafd8e8..fbb129023080 100644
--- a/arch/x86/um/tls_32.c
+++ b/arch/x86/um/tls_32.c
@@ -11,6 +11,7 @@
#include <os.h>
#include <skas.h>
#include <sysdep/tls.h>
+#include <asm/desc.h>
/*
* If needed we can detect when it's uninitialized.
@@ -20,7 +21,7 @@
static int host_supports_tls = -1;
int host_gdt_entry_tls_min;
-int do_set_thread_area(struct user_desc *info)
+static int do_set_thread_area(struct user_desc *info)
{
int ret;
u32 cpu;
@@ -36,22 +37,6 @@ int do_set_thread_area(struct user_desc *info)
return ret;
}
-int do_get_thread_area(struct user_desc *info)
-{
- int ret;
- u32 cpu;
-
- cpu = get_cpu();
- ret = os_get_thread_area(info, userspace_pid[cpu]);
- put_cpu();
-
- if (ret)
- printk(KERN_ERR "PTRACE_GET_THREAD_AREA failed, err = %d, "
- "index = %d\n", ret, info->entry_number);
-
- return ret;
-}
-
/*
* sys_get_thread_area: get a yet unused TLS descriptor index.
* XXX: Consider leaving one free slot for glibc usage at first place. This must
@@ -231,7 +216,6 @@ out:
return ret;
}
-/* XXX: use do_get_thread_area to read the host value? I'm not at all sure! */
static int get_tls_entry(struct task_struct *task, struct user_desc *info,
int idx)
{
diff --git a/arch/x86/um/user-offsets.c b/arch/x86/um/user-offsets.c
index e54a9814ccf1..1c77d9946199 100644
--- a/arch/x86/um/user-offsets.c
+++ b/arch/x86/um/user-offsets.c
@@ -14,6 +14,9 @@
COMMENT(#val " / sizeof(unsigned long)"); \
DEFINE(sym, val / sizeof(unsigned long))
+/* workaround for a warning with -Wmissing-prototypes */
+void foo(void);
+
void foo(void)
{
#ifdef __i386__
diff --git a/arch/x86/um/vdso/Makefile b/arch/x86/um/vdso/Makefile
index b86d634730b2..6a77ea6434ff 100644
--- a/arch/x86/um/vdso/Makefile
+++ b/arch/x86/um/vdso/Makefile
@@ -3,12 +3,6 @@
# Building vDSO images for x86.
#
-# do not instrument on vdso because KASAN is not compatible with user mode
-KASAN_SANITIZE := n
-
-# Prevents link failures: __sanitizer_cov_trace_pc() is not linked in.
-KCOV_INSTRUMENT := n
-
VDSO64-y := y
vdso-install-$(VDSO64-y) += vdso.so
@@ -63,7 +57,6 @@ quiet_cmd_vdso = VDSO $@
cmd_vdso = $(CC) -nostdlib -o $@ \
$(CC_FLAGS_LTO) $(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \
-Wl,-T,$(filter %.lds,$^) $(filter %.o,$^) && \
- sh $(srctree)/$(src)/checkundef.sh '$(NM)' '$@'
+ sh $(src)/checkundef.sh '$(NM)' '$@'
VDSO_LDFLAGS = -fPIC -shared -Wl,--hash-style=sysv -z noexecstack
-GCOV_PROFILE := n
diff --git a/arch/x86/um/vdso/um_vdso.c b/arch/x86/um/vdso/um_vdso.c
index ff0f3b4b6c45..cbae2584124f 100644
--- a/arch/x86/um/vdso/um_vdso.c
+++ b/arch/x86/um/vdso/um_vdso.c
@@ -13,6 +13,12 @@
#include <linux/getcpu.h>
#include <asm/unistd.h>
+/* workaround for -Wmissing-prototypes warnings */
+int __vdso_clock_gettime(clockid_t clock, struct __kernel_old_timespec *ts);
+int __vdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz);
+__kernel_old_time_t __vdso_time(__kernel_old_time_t *t);
+long __vdso_getcpu(unsigned int *cpu, unsigned int *node, struct getcpu_cache *unused);
+
int __vdso_clock_gettime(clockid_t clock, struct __kernel_old_timespec *ts)
{
long ret;
@@ -54,7 +60,7 @@ __kernel_old_time_t __vdso_time(__kernel_old_time_t *t)
__kernel_old_time_t time(__kernel_old_time_t *t) __attribute__((weak, alias("__vdso_time")));
long
-__vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused)
+__vdso_getcpu(unsigned int *cpu, unsigned int *node, struct getcpu_cache *unused)
{
/*
* UML does not support SMP, we can cheat here. :)
@@ -68,5 +74,5 @@ __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused)
return 0;
}
-long getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
+long getcpu(unsigned int *cpu, unsigned int *node, struct getcpu_cache *tcache)
__attribute__((weak, alias("__vdso_getcpu")));
diff --git a/arch/x86/um/vdso/vma.c b/arch/x86/um/vdso/vma.c
index 76d9f6ce7a3d..f238f7b33cdd 100644
--- a/arch/x86/um/vdso/vma.c
+++ b/arch/x86/um/vdso/vma.c
@@ -52,8 +52,11 @@ subsys_initcall(init_vdso);
int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
{
- int err;
+ struct vm_area_struct *vma;
struct mm_struct *mm = current->mm;
+ static struct vm_special_mapping vdso_mapping = {
+ .name = "[vdso]",
+ };
if (!vdso_enabled)
return 0;
@@ -61,12 +64,13 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
if (mmap_write_lock_killable(mm))
return -EINTR;
- err = install_special_mapping(mm, um_vdso_addr, PAGE_SIZE,
+ vdso_mapping.pages = vdsop;
+ vma = _install_special_mapping(mm, um_vdso_addr, PAGE_SIZE,
VM_READ|VM_EXEC|
VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
- vdsop);
+ &vdso_mapping);
mmap_write_unlock(mm);
- return err;
+ return IS_ERR(vma) ? PTR_ERR(vma) : 0;
}
diff --git a/arch/x86/video/Makefile b/arch/x86/video/Makefile
index 5ebe48752ffc..dcfbe7a5912c 100644
--- a/arch/x86/video/Makefile
+++ b/arch/x86/video/Makefile
@@ -1,2 +1,3 @@
# SPDX-License-Identifier: GPL-2.0-only
-obj-$(CONFIG_FB_CORE) += fbdev.o
+
+obj-y += video-common.o
diff --git a/arch/x86/video/fbdev.c b/arch/x86/video/video-common.c
index 1dd6528cc947..81fc97a2a837 100644
--- a/arch/x86/video/fbdev.c
+++ b/arch/x86/video/video-common.c
@@ -7,11 +7,11 @@
*
*/
-#include <linux/fb.h>
#include <linux/module.h>
#include <linux/pci.h>
#include <linux/vgaarb.h>
-#include <asm/fb.h>
+
+#include <asm/video.h>
pgprot_t pgprot_framebuffer(pgprot_t prot,
unsigned long vm_start, unsigned long vm_end,
@@ -25,20 +25,17 @@ pgprot_t pgprot_framebuffer(pgprot_t prot,
}
EXPORT_SYMBOL(pgprot_framebuffer);
-int fb_is_primary_device(struct fb_info *info)
+bool video_is_primary_device(struct device *dev)
{
- struct device *device = info->device;
- struct pci_dev *pci_dev;
+ struct pci_dev *pdev;
- if (!device || !dev_is_pci(device))
- return 0;
+ if (!dev_is_pci(dev))
+ return false;
- pci_dev = to_pci_dev(device);
+ pdev = to_pci_dev(dev);
- if (pci_dev == vga_default_device())
- return 1;
- return 0;
+ return (pdev == vga_default_device());
}
-EXPORT_SYMBOL(fb_is_primary_device);
+EXPORT_SYMBOL(video_is_primary_device);
MODULE_LICENSE("GPL");
diff --git a/arch/x86/virt/svm/sev.c b/arch/x86/virt/svm/sev.c
index 0ae10535c699..0ce17766c0e5 100644
--- a/arch/x86/virt/svm/sev.c
+++ b/arch/x86/virt/svm/sev.c
@@ -120,7 +120,7 @@ static __init void snp_enable(void *arg)
bool snp_probe_rmptable_info(void)
{
- u64 max_rmp_pfn, calc_rmp_sz, rmp_sz, rmp_base, rmp_end;
+ u64 rmp_sz, rmp_base, rmp_end;
rdmsrl(MSR_AMD64_RMP_BASE, rmp_base);
rdmsrl(MSR_AMD64_RMP_END, rmp_end);
@@ -137,28 +137,11 @@ bool snp_probe_rmptable_info(void)
rmp_sz = rmp_end - rmp_base + 1;
- /*
- * Calculate the amount the memory that must be reserved by the BIOS to
- * address the whole RAM, including the bookkeeping area. The RMP itself
- * must also be covered.
- */
- max_rmp_pfn = max_pfn;
- if (PHYS_PFN(rmp_end) > max_pfn)
- max_rmp_pfn = PHYS_PFN(rmp_end);
-
- calc_rmp_sz = (max_rmp_pfn << 4) + RMPTABLE_CPU_BOOKKEEPING_SZ;
-
- if (calc_rmp_sz > rmp_sz) {
- pr_err("Memory reserved for the RMP table does not cover full system RAM (expected 0x%llx got 0x%llx)\n",
- calc_rmp_sz, rmp_sz);
- return false;
- }
-
probed_rmp_base = rmp_base;
probed_rmp_size = rmp_sz;
pr_info("RMP table physical range [0x%016llx - 0x%016llx]\n",
- probed_rmp_base, probed_rmp_base + probed_rmp_size - 1);
+ rmp_base, rmp_end);
return true;
}
@@ -206,9 +189,8 @@ void __init snp_fixup_e820_tables(void)
*/
static int __init snp_rmptable_init(void)
{
+ u64 max_rmp_pfn, calc_rmp_sz, rmptable_size, rmp_end, val;
void *rmptable_start;
- u64 rmptable_size;
- u64 val;
if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
return 0;
@@ -219,10 +201,28 @@ static int __init snp_rmptable_init(void)
if (!probed_rmp_size)
goto nosnp;
+ rmp_end = probed_rmp_base + probed_rmp_size - 1;
+
+ /*
+ * Calculate the amount the memory that must be reserved by the BIOS to
+ * address the whole RAM, including the bookkeeping area. The RMP itself
+ * must also be covered.
+ */
+ max_rmp_pfn = max_pfn;
+ if (PFN_UP(rmp_end) > max_pfn)
+ max_rmp_pfn = PFN_UP(rmp_end);
+
+ calc_rmp_sz = (max_rmp_pfn << 4) + RMPTABLE_CPU_BOOKKEEPING_SZ;
+ if (calc_rmp_sz > probed_rmp_size) {
+ pr_err("Memory reserved for the RMP table does not cover full system RAM (expected 0x%llx got 0x%llx)\n",
+ calc_rmp_sz, probed_rmp_size);
+ goto nosnp;
+ }
+
rmptable_start = memremap(probed_rmp_base, probed_rmp_size, MEMREMAP_WB);
if (!rmptable_start) {
pr_err("Failed to map RMP table\n");
- return 1;
+ goto nosnp;
}
/*
diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
index 4d6826a76f78..4e2b2e2ac9f9 100644
--- a/arch/x86/virt/vmx/tdx/tdx.c
+++ b/arch/x86/virt/vmx/tdx/tdx.c
@@ -27,14 +27,13 @@
#include <linux/log2.h>
#include <linux/acpi.h>
#include <linux/suspend.h>
-#include <linux/acpi.h>
#include <asm/page.h>
#include <asm/special_insns.h>
#include <asm/msr-index.h>
#include <asm/msr.h>
#include <asm/cpufeature.h>
#include <asm/tdx.h>
-#include <asm/intel-family.h>
+#include <asm/cpu_device_id.h>
#include <asm/processor.h>
#include <asm/mce.h>
#include "tdx.h"
@@ -1427,9 +1426,9 @@ static void __init check_tdx_erratum(void)
* private memory poisons that memory, and a subsequent read of
* that memory triggers #MC.
*/
- switch (boot_cpu_data.x86_model) {
- case INTEL_FAM6_SAPPHIRERAPIDS_X:
- case INTEL_FAM6_EMERALDRAPIDS_X:
+ switch (boot_cpu_data.x86_vfm) {
+ case INTEL_SAPPHIRERAPIDS_X:
+ case INTEL_EMERALDRAPIDS_X:
setup_force_cpu_bug(X86_BUG_TDX_PW_MCE);
}
}
diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c
index 8b045dd25196..bb0f3f368446 100644
--- a/arch/x86/xen/apic.c
+++ b/arch/x86/xen/apic.c
@@ -10,8 +10,6 @@
#include <xen/xen.h>
#include <xen/interface/physdev.h>
#include "xen-ops.h"
-#include "pmu.h"
-#include "smp.h"
static unsigned int xen_io_apic_read(unsigned apic, unsigned reg)
{
diff --git a/arch/x86/xen/debugfs.c b/arch/x86/xen/debugfs.c
index 532410998684..b8c9f2a7d9b6 100644
--- a/arch/x86/xen/debugfs.c
+++ b/arch/x86/xen/debugfs.c
@@ -3,7 +3,7 @@
#include <linux/debugfs.h>
#include <linux/slab.h>
-#include "debugfs.h"
+#include "xen-ops.h"
static struct dentry *d_xen_debug;
diff --git a/arch/x86/xen/debugfs.h b/arch/x86/xen/debugfs.h
deleted file mode 100644
index 6b813ad1091c..000000000000
--- a/arch/x86/xen/debugfs.h
+++ /dev/null
@@ -1,7 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _XEN_DEBUGFS_H
-#define _XEN_DEBUGFS_H
-
-struct dentry * __init xen_init_debugfs(void);
-
-#endif /* _XEN_DEBUGFS_H */
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index a01ca255b0c6..84e5adbd0925 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1,8 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
-#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG
-#include <linux/memblock.h>
-#endif
#include <linux/console.h>
#include <linux/cpu.h>
#include <linux/kexec.h>
@@ -23,8 +20,6 @@
#include <asm/setup.h>
#include "xen-ops.h"
-#include "smp.h"
-#include "pmu.h"
EXPORT_SYMBOL_GPL(hypercall_page);
@@ -382,3 +377,36 @@ void __init xen_add_extra_mem(unsigned long start_pfn, unsigned long n_pfns)
memblock_reserve(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns));
}
+
+#ifdef CONFIG_XEN_UNPOPULATED_ALLOC
+int __init arch_xen_unpopulated_init(struct resource **res)
+{
+ unsigned int i;
+
+ if (!xen_domain())
+ return -ENODEV;
+
+ /* Must be set strictly before calling xen_free_unpopulated_pages(). */
+ *res = &iomem_resource;
+
+ /*
+ * Initialize with pages from the extra memory regions (see
+ * arch/x86/xen/setup.c).
+ */
+ for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
+ unsigned int j;
+
+ for (j = 0; j < xen_extra_mem[i].n_pfns; j++) {
+ struct page *pg =
+ pfn_to_page(xen_extra_mem[i].start_pfn + j);
+
+ xen_free_unpopulated_pages(1, &pg);
+ }
+
+ /* Zero so region is not also added to the balloon driver. */
+ xen_extra_mem[i].n_pfns = 0;
+ }
+
+ return 0;
+}
+#endif
diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c
index c001a2296582..24d2957a4726 100644
--- a/arch/x86/xen/enlighten_hvm.c
+++ b/arch/x86/xen/enlighten_hvm.c
@@ -28,8 +28,6 @@
#include <asm/xen/page.h>
#include "xen-ops.h"
-#include "mmu.h"
-#include "smp.h"
static unsigned long shared_info_pfn;
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index 9ba53814ed6a..2c12ae42dc8b 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -85,10 +85,6 @@
#endif
#include "xen-ops.h"
-#include "mmu.h"
-#include "smp.h"
-#include "multicalls.h"
-#include "pmu.h"
#include "../kernel/cpu/cpu.h" /* get_cpu_cap() */
diff --git a/arch/x86/xen/enlighten_pvh.c b/arch/x86/xen/enlighten_pvh.c
index 27a2a02ef8fb..bf68c329fc01 100644
--- a/arch/x86/xen/enlighten_pvh.c
+++ b/arch/x86/xen/enlighten_pvh.c
@@ -4,11 +4,13 @@
#include <linux/mm.h>
#include <xen/hvc-console.h>
+#include <xen/acpi.h>
#include <asm/bootparam.h>
#include <asm/io_apic.h>
#include <asm/hypervisor.h>
#include <asm/e820/api.h>
+#include <asm/setup.h>
#include <xen/xen.h>
#include <asm/xen/interface.h>
@@ -27,53 +29,27 @@
bool __ro_after_init xen_pvh;
EXPORT_SYMBOL_GPL(xen_pvh);
-void __init xen_pvh_init(struct boot_params *boot_params)
+#ifdef CONFIG_XEN_DOM0
+int xen_pvh_setup_gsi(int gsi, int trigger, int polarity)
{
- u32 msr;
- u64 pfn;
-
- xen_pvh = 1;
- xen_domain_type = XEN_HVM_DOMAIN;
- xen_start_flags = pvh_start_info.flags;
-
- msr = cpuid_ebx(xen_cpuid_base() + 2);
- pfn = __pa(hypercall_page);
- wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
+ int ret;
+ struct physdev_setup_gsi setup_gsi;
- if (xen_initial_domain())
- x86_init.oem.arch_setup = xen_add_preferred_consoles;
- x86_init.oem.banner = xen_banner;
-
- xen_efi_init(boot_params);
-
- if (xen_initial_domain()) {
- struct xen_platform_op op = {
- .cmd = XENPF_get_dom0_console,
- };
- int ret = HYPERVISOR_platform_op(&op);
-
- if (ret > 0)
- xen_init_vga(&op.u.dom0_console,
- min(ret * sizeof(char),
- sizeof(op.u.dom0_console)),
- &boot_params->screen_info);
- }
-}
+ setup_gsi.gsi = gsi;
+ setup_gsi.triggering = (trigger == ACPI_EDGE_SENSITIVE ? 0 : 1);
+ setup_gsi.polarity = (polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
-void __init mem_map_via_hcall(struct boot_params *boot_params_p)
-{
- struct xen_memory_map memmap;
- int rc;
+ ret = HYPERVISOR_physdev_op(PHYSDEVOP_setup_gsi, &setup_gsi);
+ if (ret == -EEXIST) {
+ xen_raw_printk("Already setup the GSI :%d\n", gsi);
+ ret = 0;
+ } else if (ret)
+ xen_raw_printk("Fail to setup GSI (%d)!\n", gsi);
- memmap.nr_entries = ARRAY_SIZE(boot_params_p->e820_table);
- set_xen_guest_handle(memmap.buffer, boot_params_p->e820_table);
- rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
- if (rc) {
- xen_raw_printk("XENMEM_memory_map failed (%d)\n", rc);
- BUG();
- }
- boot_params_p->e820_entries = memmap.nr_entries;
+ return ret;
}
+EXPORT_SYMBOL_GPL(xen_pvh_setup_gsi);
+#endif
/*
* Reserve e820 UNUSABLE regions to inflate the memory balloon.
@@ -89,8 +65,9 @@ void __init mem_map_via_hcall(struct boot_params *boot_params_p)
* hypervisor should notify us which memory ranges are suitable for creating
* foreign mappings, but that's not yet implemented.
*/
-void __init xen_reserve_extra_memory(struct boot_params *bootp)
+static void __init pvh_reserve_extra_memory(void)
{
+ struct boot_params *bootp = &boot_params;
unsigned int i, ram_pages = 0, extra_pages;
for (i = 0; i < bootp->e820_entries; i++) {
@@ -141,3 +118,58 @@ void __init xen_reserve_extra_memory(struct boot_params *bootp)
xen_add_extra_mem(PFN_UP(e->addr), pages);
}
}
+
+static void __init pvh_arch_setup(void)
+{
+ pvh_reserve_extra_memory();
+
+ if (xen_initial_domain())
+ xen_add_preferred_consoles();
+}
+
+void __init xen_pvh_init(struct boot_params *boot_params)
+{
+ u32 msr;
+ u64 pfn;
+
+ xen_pvh = 1;
+ xen_domain_type = XEN_HVM_DOMAIN;
+ xen_start_flags = pvh_start_info.flags;
+
+ msr = cpuid_ebx(xen_cpuid_base() + 2);
+ pfn = __pa(hypercall_page);
+ wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
+
+ x86_init.oem.arch_setup = pvh_arch_setup;
+ x86_init.oem.banner = xen_banner;
+
+ xen_efi_init(boot_params);
+
+ if (xen_initial_domain()) {
+ struct xen_platform_op op = {
+ .cmd = XENPF_get_dom0_console,
+ };
+ int ret = HYPERVISOR_platform_op(&op);
+
+ if (ret > 0)
+ xen_init_vga(&op.u.dom0_console,
+ min(ret * sizeof(char),
+ sizeof(op.u.dom0_console)),
+ &boot_params->screen_info);
+ }
+}
+
+void __init mem_map_via_hcall(struct boot_params *boot_params_p)
+{
+ struct xen_memory_map memmap;
+ int rc;
+
+ memmap.nr_entries = ARRAY_SIZE(boot_params_p->e820_table);
+ set_xen_guest_handle(memmap.buffer, boot_params_p->e820_table);
+ rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
+ if (rc) {
+ xen_raw_printk("XENMEM_memory_map failed (%d)\n", rc);
+ BUG();
+ }
+ boot_params_p->e820_entries = memmap.nr_entries;
+}
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 60e9c37fd79f..c4c479373249 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -5,8 +5,7 @@
#include <asm/xen/hypercall.h>
#include <xen/interface/memory.h>
-#include "multicalls.h"
-#include "mmu.h"
+#include "xen-ops.h"
unsigned long arbitrary_virt_to_mfn(void *vaddr)
{
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
deleted file mode 100644
index 6e4c6bd62203..000000000000
--- a/arch/x86/xen/mmu.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _XEN_MMU_H
-
-#include <linux/linkage.h>
-#include <asm/page.h>
-
-enum pt_level {
- PT_PGD,
- PT_P4D,
- PT_PUD,
- PT_PMD,
- PT_PTE
-};
-
-
-bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn);
-
-void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
-
-pte_t xen_ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep);
-void xen_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
- pte_t *ptep, pte_t pte);
-
-unsigned long xen_read_cr2_direct(void);
-
-extern void xen_init_mmu_ops(void);
-extern void xen_hvm_init_mmu_ops(void);
-#endif /* _XEN_MMU_H */
diff --git a/arch/x86/xen/mmu_hvm.c b/arch/x86/xen/mmu_hvm.c
index 509bdee3ab90..337955652202 100644
--- a/arch/x86/xen/mmu_hvm.c
+++ b/arch/x86/xen/mmu_hvm.c
@@ -5,7 +5,7 @@
#include <xen/interface/xen.h>
#include <xen/hvm.h>
-#include "mmu.h"
+#include "xen-ops.h"
#ifdef CONFIG_PROC_VMCORE
/*
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index 54e0d311dcc9..55a4996d0c04 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -82,9 +82,7 @@
#include <xen/hvc-console.h>
#include <xen/swiotlb-xen.h>
-#include "multicalls.h"
-#include "mmu.h"
-#include "debugfs.h"
+#include "xen-ops.h"
/*
* Prototypes for functions called via PV_CALLEE_SAVE_REGS_THUNK() in order
@@ -128,7 +126,7 @@ static DEFINE_SPINLOCK(xen_reservation_lock);
* looking at another vcpu's cr3 value, it should use this variable.
*/
DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */
-DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
+static DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
static phys_addr_t xen_pt_base, xen_pt_size __initdata;
@@ -305,16 +303,17 @@ static void xen_set_pte(pte_t *ptep, pte_t pteval)
__xen_set_pte(ptep, pteval);
}
-pte_t xen_ptep_modify_prot_start(struct vm_area_struct *vma,
- unsigned long addr, pte_t *ptep)
+static pte_t xen_ptep_modify_prot_start(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep)
{
/* Just return the pte as-is. We preserve the bits on commit */
trace_xen_mmu_ptep_modify_prot_start(vma->vm_mm, addr, ptep, *ptep);
return *ptep;
}
-void xen_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
- pte_t *ptep, pte_t pte)
+static void xen_ptep_modify_prot_commit(struct vm_area_struct *vma,
+ unsigned long addr,
+ pte_t *ptep, pte_t pte)
{
struct mmu_update u;
@@ -666,7 +665,7 @@ static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm)
{
spinlock_t *ptl = NULL;
-#if USE_SPLIT_PTE_PTLOCKS
+#if defined(CONFIG_SPLIT_PTE_PTLOCKS)
ptl = ptlock_ptr(page_ptdesc(page));
spin_lock_nest_lock(ptl, &mm->page_table_lock);
#endif
@@ -1554,7 +1553,8 @@ static inline void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn,
__set_pfn_prot(pfn, PAGE_KERNEL_RO);
- if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS && !pinned)
+ if (level == PT_PTE && IS_ENABLED(CONFIG_SPLIT_PTE_PTLOCKS) &&
+ !pinned)
__pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
xen_mc_issue(XEN_LAZY_MMU);
@@ -1582,7 +1582,7 @@ static inline void xen_release_ptpage(unsigned long pfn, unsigned level)
if (pinned) {
xen_mc_batch();
- if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS)
+ if (level == PT_PTE && IS_ENABLED(CONFIG_SPLIT_PTE_PTLOCKS))
__pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
__set_pfn_prot(pfn, PAGE_KERNEL);
@@ -2019,10 +2019,7 @@ void __init xen_reserve_special_pages(void)
void __init xen_pt_check_e820(void)
{
- if (xen_is_e820_reserved(xen_pt_base, xen_pt_size)) {
- xen_raw_console_write("Xen hypervisor allocated page table memory conflicts with E820 map\n");
- BUG();
- }
+ xen_chk_is_e820_usable(xen_pt_base, xen_pt_size, "page table");
}
static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss;
diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c
index 07054572297f..10c660fae8b3 100644
--- a/arch/x86/xen/multicalls.c
+++ b/arch/x86/xen/multicalls.c
@@ -23,26 +23,21 @@
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <linux/debugfs.h>
+#include <linux/jump_label.h>
+#include <linux/printk.h>
#include <asm/xen/hypercall.h>
-#include "multicalls.h"
-#include "debugfs.h"
+#include "xen-ops.h"
#define MC_BATCH 32
-#define MC_DEBUG 0
-
#define MC_ARGS (MC_BATCH * 16)
struct mc_buffer {
unsigned mcidx, argidx, cbidx;
struct multicall_entry entries[MC_BATCH];
-#if MC_DEBUG
- struct multicall_entry debug[MC_BATCH];
- void *caller[MC_BATCH];
-#endif
unsigned char args[MC_ARGS];
struct callback {
void (*fn)(void *);
@@ -50,13 +45,103 @@ struct mc_buffer {
} callbacks[MC_BATCH];
};
+struct mc_debug_data {
+ struct multicall_entry entries[MC_BATCH];
+ void *caller[MC_BATCH];
+ size_t argsz[MC_BATCH];
+ unsigned long *args[MC_BATCH];
+};
+
static DEFINE_PER_CPU(struct mc_buffer, mc_buffer);
+static struct mc_debug_data mc_debug_data_early __initdata;
+static DEFINE_PER_CPU(struct mc_debug_data *, mc_debug_data) =
+ &mc_debug_data_early;
+static struct mc_debug_data __percpu *mc_debug_data_ptr;
DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags);
+static struct static_key mc_debug __ro_after_init;
+static bool mc_debug_enabled __initdata;
+
+static int __init xen_parse_mc_debug(char *arg)
+{
+ mc_debug_enabled = true;
+ static_key_slow_inc(&mc_debug);
+
+ return 0;
+}
+early_param("xen_mc_debug", xen_parse_mc_debug);
+
+void mc_percpu_init(unsigned int cpu)
+{
+ per_cpu(mc_debug_data, cpu) = per_cpu_ptr(mc_debug_data_ptr, cpu);
+}
+
+static int __init mc_debug_enable(void)
+{
+ unsigned long flags;
+
+ if (!mc_debug_enabled)
+ return 0;
+
+ mc_debug_data_ptr = alloc_percpu(struct mc_debug_data);
+ if (!mc_debug_data_ptr) {
+ pr_err("xen_mc_debug inactive\n");
+ static_key_slow_dec(&mc_debug);
+ return -ENOMEM;
+ }
+
+ /* Be careful when switching to percpu debug data. */
+ local_irq_save(flags);
+ xen_mc_flush();
+ mc_percpu_init(0);
+ local_irq_restore(flags);
+
+ pr_info("xen_mc_debug active\n");
+
+ return 0;
+}
+early_initcall(mc_debug_enable);
+
+/* Number of parameters of hypercalls used via multicalls. */
+static const uint8_t hpcpars[] = {
+ [__HYPERVISOR_mmu_update] = 4,
+ [__HYPERVISOR_stack_switch] = 2,
+ [__HYPERVISOR_fpu_taskswitch] = 1,
+ [__HYPERVISOR_update_descriptor] = 2,
+ [__HYPERVISOR_update_va_mapping] = 3,
+ [__HYPERVISOR_mmuext_op] = 4,
+};
+
+static void print_debug_data(struct mc_buffer *b, struct mc_debug_data *mcdb,
+ int idx)
+{
+ unsigned int arg;
+ unsigned int opidx = mcdb->entries[idx].op & 0xff;
+ unsigned int pars = 0;
+
+ pr_err(" call %2d: op=%lu result=%ld caller=%pS ", idx + 1,
+ mcdb->entries[idx].op, b->entries[idx].result,
+ mcdb->caller[idx]);
+ if (opidx < ARRAY_SIZE(hpcpars))
+ pars = hpcpars[opidx];
+ if (pars) {
+ pr_cont("pars=");
+ for (arg = 0; arg < pars; arg++)
+ pr_cont("%lx ", mcdb->entries[idx].args[arg]);
+ }
+ if (mcdb->argsz[idx]) {
+ pr_cont("args=");
+ for (arg = 0; arg < mcdb->argsz[idx] / 8; arg++)
+ pr_cont("%lx ", mcdb->args[idx][arg]);
+ }
+ pr_cont("\n");
+}
+
void xen_mc_flush(void)
{
struct mc_buffer *b = this_cpu_ptr(&mc_buffer);
struct multicall_entry *mc;
+ struct mc_debug_data *mcdb = NULL;
int ret = 0;
unsigned long flags;
int i;
@@ -69,10 +154,11 @@ void xen_mc_flush(void)
trace_xen_mc_flush(b->mcidx, b->argidx, b->cbidx);
-#if MC_DEBUG
- memcpy(b->debug, b->entries,
- b->mcidx * sizeof(struct multicall_entry));
-#endif
+ if (static_key_false(&mc_debug)) {
+ mcdb = __this_cpu_read(mc_debug_data);
+ memcpy(mcdb->entries, b->entries,
+ b->mcidx * sizeof(struct multicall_entry));
+ }
switch (b->mcidx) {
case 0:
@@ -103,21 +189,14 @@ void xen_mc_flush(void)
pr_err("%d of %d multicall(s) failed: cpu %d\n",
ret, b->mcidx, smp_processor_id());
for (i = 0; i < b->mcidx; i++) {
- if (b->entries[i].result < 0) {
-#if MC_DEBUG
- pr_err(" call %2d: op=%lu arg=[%lx] result=%ld\t%pS\n",
- i + 1,
- b->debug[i].op,
- b->debug[i].args[0],
- b->entries[i].result,
- b->caller[i]);
-#else
+ if (static_key_false(&mc_debug)) {
+ print_debug_data(b, mcdb, i);
+ } else if (b->entries[i].result < 0) {
pr_err(" call %2d: op=%lu arg=[%lx] result=%ld\n",
i + 1,
b->entries[i].op,
b->entries[i].args[0],
b->entries[i].result);
-#endif
}
}
}
@@ -155,9 +234,13 @@ struct multicall_space __xen_mc_entry(size_t args)
}
ret.mc = &b->entries[b->mcidx];
-#if MC_DEBUG
- b->caller[b->mcidx] = __builtin_return_address(0);
-#endif
+ if (static_key_false(&mc_debug)) {
+ struct mc_debug_data *mcdb = __this_cpu_read(mc_debug_data);
+
+ mcdb->caller[b->mcidx] = __builtin_return_address(0);
+ mcdb->argsz[b->mcidx] = args;
+ mcdb->args[b->mcidx] = (unsigned long *)(&b->args[argidx]);
+ }
b->mcidx++;
ret.args = &b->args[argidx];
b->argidx = argidx + args;
diff --git a/arch/x86/xen/multicalls.h b/arch/x86/xen/multicalls.h
deleted file mode 100644
index c3867b585e0d..000000000000
--- a/arch/x86/xen/multicalls.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _XEN_MULTICALLS_H
-#define _XEN_MULTICALLS_H
-
-#include <trace/events/xen.h>
-
-#include "xen-ops.h"
-
-/* Multicalls */
-struct multicall_space
-{
- struct multicall_entry *mc;
- void *args;
-};
-
-/* Allocate room for a multicall and its args */
-struct multicall_space __xen_mc_entry(size_t args);
-
-DECLARE_PER_CPU(unsigned long, xen_mc_irq_flags);
-
-/* Call to start a batch of multiple __xen_mc_entry()s. Must be
- paired with xen_mc_issue() */
-static inline void xen_mc_batch(void)
-{
- unsigned long flags;
-
- /* need to disable interrupts until this entry is complete */
- local_irq_save(flags);
- trace_xen_mc_batch(xen_get_lazy_mode());
- __this_cpu_write(xen_mc_irq_flags, flags);
-}
-
-static inline struct multicall_space xen_mc_entry(size_t args)
-{
- xen_mc_batch();
- return __xen_mc_entry(args);
-}
-
-/* Flush all pending multicalls */
-void xen_mc_flush(void);
-
-/* Issue a multicall if we're not in a lazy mode */
-static inline void xen_mc_issue(unsigned mode)
-{
- trace_xen_mc_issue(mode);
-
- if ((xen_get_lazy_mode() & mode) == 0)
- xen_mc_flush();
-
- /* restore flags saved in xen_mc_batch */
- local_irq_restore(this_cpu_read(xen_mc_irq_flags));
-}
-
-/* Set up a callback to be called when the current batch is flushed */
-void xen_mc_callback(void (*fn)(void *), void *data);
-
-/*
- * Try to extend the arguments of the previous multicall command. The
- * previous command's op must match. If it does, then it attempts to
- * extend the argument space allocated to the multicall entry by
- * arg_size bytes.
- *
- * The returned multicall_space will return with mc pointing to the
- * command on success, or NULL on failure, and args pointing to the
- * newly allocated space.
- */
-struct multicall_space xen_mc_extend_args(unsigned long op, size_t arg_size);
-
-#endif /* _XEN_MULTICALLS_H */
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 9bdc3b656b2c..b52d3e17e2c1 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -70,6 +70,7 @@
#include <linux/memblock.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
+#include <linux/acpi.h>
#include <asm/cache.h>
#include <asm/setup.h>
@@ -80,8 +81,8 @@
#include <asm/xen/hypervisor.h>
#include <xen/balloon.h>
#include <xen/grant_table.h>
+#include <xen/hvc-console.h>
-#include "multicalls.h"
#include "xen-ops.h"
#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *))
@@ -555,7 +556,6 @@ int xen_alloc_p2m_entry(unsigned long pfn)
/* Separately check the mid mfn level */
unsigned long missing_mfn;
unsigned long mid_mfn_mfn;
- unsigned long old_mfn;
mid_mfn = alloc_p2m_page();
if (!mid_mfn)
@@ -565,12 +565,12 @@ int xen_alloc_p2m_entry(unsigned long pfn)
missing_mfn = virt_to_mfn(p2m_mid_missing_mfn);
mid_mfn_mfn = virt_to_mfn(mid_mfn);
- old_mfn = cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn);
- if (old_mfn != missing_mfn) {
- free_p2m_page(mid_mfn);
- mid_mfn = mfn_to_virt(old_mfn);
- } else {
+ /* try_cmpxchg() updates missing_mfn on failure. */
+ if (try_cmpxchg(top_mfn_p, &missing_mfn, mid_mfn_mfn)) {
p2m_top_mfn_p[topidx] = mid_mfn;
+ } else {
+ free_p2m_page(mid_mfn);
+ mid_mfn = mfn_to_virt(missing_mfn);
}
}
} else {
@@ -731,7 +731,7 @@ int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops,
* immediate unmapping.
*/
map_ops[i].status = GNTST_general_error;
- unmap[0].host_addr = map_ops[i].host_addr,
+ unmap[0].host_addr = map_ops[i].host_addr;
unmap[0].handle = map_ops[i].handle;
map_ops[i].handle = INVALID_GRANT_HANDLE;
if (map_ops[i].flags & GNTMAP_device_map)
@@ -741,7 +741,7 @@ int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops,
if (kmap_ops) {
kmap_ops[i].status = GNTST_general_error;
- unmap[1].host_addr = kmap_ops[i].host_addr,
+ unmap[1].host_addr = kmap_ops[i].host_addr;
unmap[1].handle = kmap_ops[i].handle;
kmap_ops[i].handle = INVALID_GRANT_HANDLE;
if (kmap_ops[i].flags & GNTMAP_device_map)
@@ -794,9 +794,104 @@ int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops,
return ret;
}
+/* Remapped non-RAM areas */
+#define NR_NONRAM_REMAP 4
+static struct nonram_remap {
+ phys_addr_t maddr;
+ phys_addr_t paddr;
+ size_t size;
+} xen_nonram_remap[NR_NONRAM_REMAP] __ro_after_init;
+static unsigned int nr_nonram_remap __ro_after_init;
+
+/*
+ * Do the real remapping of non-RAM regions as specified in the
+ * xen_nonram_remap[] array.
+ * In case of an error just crash the system.
+ */
+void __init xen_do_remap_nonram(void)
+{
+ unsigned int i;
+ unsigned int remapped = 0;
+ const struct nonram_remap *remap = xen_nonram_remap;
+ unsigned long pfn, mfn, end_pfn;
+
+ for (i = 0; i < nr_nonram_remap; i++) {
+ end_pfn = PFN_UP(remap->paddr + remap->size);
+ pfn = PFN_DOWN(remap->paddr);
+ mfn = PFN_DOWN(remap->maddr);
+ while (pfn < end_pfn) {
+ if (!set_phys_to_machine(pfn, mfn))
+ panic("Failed to set p2m mapping for pfn=%lx mfn=%lx\n",
+ pfn, mfn);
+
+ pfn++;
+ mfn++;
+ remapped++;
+ }
+
+ remap++;
+ }
+
+ pr_info("Remapped %u non-RAM page(s)\n", remapped);
+}
+
+#ifdef CONFIG_ACPI
+/*
+ * Xen variant of acpi_os_ioremap() taking potentially remapped non-RAM
+ * regions into account.
+ * Any attempt to map an area crossing a remap boundary will produce a
+ * WARN() splat.
+ * phys is related to remap->maddr on input and will be rebased to remap->paddr.
+ */
+static void __iomem *xen_acpi_os_ioremap(acpi_physical_address phys,
+ acpi_size size)
+{
+ unsigned int i;
+ const struct nonram_remap *remap = xen_nonram_remap;
+
+ for (i = 0; i < nr_nonram_remap; i++) {
+ if (phys + size > remap->maddr &&
+ phys < remap->maddr + remap->size) {
+ WARN_ON(phys < remap->maddr ||
+ phys + size > remap->maddr + remap->size);
+ phys += remap->paddr - remap->maddr;
+ break;
+ }
+ }
+
+ return x86_acpi_os_ioremap(phys, size);
+}
+#endif /* CONFIG_ACPI */
+
+/*
+ * Add a new non-RAM remap entry.
+ * In case of no free entry found, just crash the system.
+ */
+void __init xen_add_remap_nonram(phys_addr_t maddr, phys_addr_t paddr,
+ unsigned long size)
+{
+ BUG_ON((maddr & ~PAGE_MASK) != (paddr & ~PAGE_MASK));
+
+ if (nr_nonram_remap == NR_NONRAM_REMAP) {
+ xen_raw_console_write("Number of required E820 entry remapping actions exceed maximum value\n");
+ BUG();
+ }
+
+#ifdef CONFIG_ACPI
+ /* Switch to the Xen acpi_os_ioremap() variant. */
+ if (nr_nonram_remap == 0)
+ acpi_os_ioremap = xen_acpi_os_ioremap;
+#endif
+
+ xen_nonram_remap[nr_nonram_remap].maddr = maddr;
+ xen_nonram_remap[nr_nonram_remap].paddr = paddr;
+ xen_nonram_remap[nr_nonram_remap].size = size;
+
+ nr_nonram_remap++;
+}
+
#ifdef CONFIG_XEN_DEBUG_FS
#include <linux/debugfs.h>
-#include "debugfs.h"
static int p2m_dump_show(struct seq_file *m, void *v)
{
static const char * const type_name[] = {
diff --git a/arch/x86/xen/pmu.c b/arch/x86/xen/pmu.c
index 246d67dab510..f06987b0efc3 100644
--- a/arch/x86/xen/pmu.c
+++ b/arch/x86/xen/pmu.c
@@ -10,7 +10,6 @@
#include <xen/interface/xenpmu.h>
#include "xen-ops.h"
-#include "pmu.h"
/* x86_pmu.handle_irq definition */
#include "../events/perf_event.h"
diff --git a/arch/x86/xen/pmu.h b/arch/x86/xen/pmu.h
deleted file mode 100644
index 65c58894fc79..000000000000
--- a/arch/x86/xen/pmu.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __XEN_PMU_H
-#define __XEN_PMU_H
-
-#include <xen/interface/xenpmu.h>
-
-extern bool is_xen_pmu;
-
-irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id);
-#ifdef CONFIG_XEN_HAVE_VPMU
-void xen_pmu_init(int cpu);
-void xen_pmu_finish(int cpu);
-#else
-static inline void xen_pmu_init(int cpu) {}
-static inline void xen_pmu_finish(int cpu) {}
-#endif
-bool pmu_msr_read(unsigned int msr, uint64_t *val, int *err);
-bool pmu_msr_write(unsigned int msr, uint32_t low, uint32_t high, int *err);
-int pmu_apic_update(uint32_t reg);
-unsigned long long xen_read_pmc(int counter);
-
-#endif /* __XEN_PMU_H */
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 380591028cb8..c3db71d96c43 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -15,12 +15,12 @@
#include <linux/cpuidle.h>
#include <linux/cpufreq.h>
#include <linux/memory_hotplug.h>
+#include <linux/acpi.h>
#include <asm/elf.h>
#include <asm/vdso.h>
#include <asm/e820/api.h>
#include <asm/setup.h>
-#include <asm/acpi.h>
#include <asm/numa.h>
#include <asm/idtentry.h>
#include <asm/xen/hypervisor.h>
@@ -34,7 +34,6 @@
#include <xen/features.h>
#include <xen/hvc-console.h>
#include "xen-ops.h"
-#include "mmu.h"
#define GB(x) ((uint64_t)(x) * 1024 * 1024 * 1024)
@@ -47,6 +46,9 @@ bool xen_pv_pci_possible;
/* E820 map used during setting up memory. */
static struct e820_table xen_e820_table __initdata;
+/* Number of initially usable memory pages. */
+static unsigned long ini_nr_pages __initdata;
+
/*
* Buffer used to remap identity mapped pages. We only need the virtual space.
* The physical page behind this address is remapped as needed to different
@@ -213,7 +215,7 @@ static int __init xen_free_mfn(unsigned long mfn)
* as a fallback if the remapping fails.
*/
static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn,
- unsigned long end_pfn, unsigned long nr_pages)
+ unsigned long end_pfn)
{
unsigned long pfn, end;
int ret;
@@ -221,7 +223,7 @@ static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn,
WARN_ON(start_pfn > end_pfn);
/* Release pages first. */
- end = min(end_pfn, nr_pages);
+ end = min(end_pfn, ini_nr_pages);
for (pfn = start_pfn; pfn < end; pfn++) {
unsigned long mfn = pfn_to_mfn(pfn);
@@ -342,15 +344,14 @@ static void __init xen_do_set_identity_and_remap_chunk(
* to Xen and not remapped.
*/
static unsigned long __init xen_set_identity_and_remap_chunk(
- unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages,
- unsigned long remap_pfn)
+ unsigned long start_pfn, unsigned long end_pfn, unsigned long remap_pfn)
{
unsigned long pfn;
unsigned long i = 0;
unsigned long n = end_pfn - start_pfn;
if (remap_pfn == 0)
- remap_pfn = nr_pages;
+ remap_pfn = ini_nr_pages;
while (i < n) {
unsigned long cur_pfn = start_pfn + i;
@@ -359,19 +360,19 @@ static unsigned long __init xen_set_identity_and_remap_chunk(
unsigned long remap_range_size;
/* Do not remap pages beyond the current allocation */
- if (cur_pfn >= nr_pages) {
+ if (cur_pfn >= ini_nr_pages) {
/* Identity map remaining pages */
set_phys_range_identity(cur_pfn, cur_pfn + size);
break;
}
- if (cur_pfn + size > nr_pages)
- size = nr_pages - cur_pfn;
+ if (cur_pfn + size > ini_nr_pages)
+ size = ini_nr_pages - cur_pfn;
remap_range_size = xen_find_pfn_range(&remap_pfn);
if (!remap_range_size) {
pr_warn("Unable to find available pfn range, not remapping identity pages\n");
xen_set_identity_and_release_chunk(cur_pfn,
- cur_pfn + left, nr_pages);
+ cur_pfn + left);
break;
}
/* Adjust size to fit in current e820 RAM region */
@@ -398,18 +399,18 @@ static unsigned long __init xen_set_identity_and_remap_chunk(
}
static unsigned long __init xen_count_remap_pages(
- unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages,
+ unsigned long start_pfn, unsigned long end_pfn,
unsigned long remap_pages)
{
- if (start_pfn >= nr_pages)
+ if (start_pfn >= ini_nr_pages)
return remap_pages;
- return remap_pages + min(end_pfn, nr_pages) - start_pfn;
+ return remap_pages + min(end_pfn, ini_nr_pages) - start_pfn;
}
-static unsigned long __init xen_foreach_remap_area(unsigned long nr_pages,
+static unsigned long __init xen_foreach_remap_area(
unsigned long (*func)(unsigned long start_pfn, unsigned long end_pfn,
- unsigned long nr_pages, unsigned long last_val))
+ unsigned long last_val))
{
phys_addr_t start = 0;
unsigned long ret_val = 0;
@@ -437,8 +438,7 @@ static unsigned long __init xen_foreach_remap_area(unsigned long nr_pages,
end_pfn = PFN_UP(entry->addr);
if (start_pfn < end_pfn)
- ret_val = func(start_pfn, end_pfn, nr_pages,
- ret_val);
+ ret_val = func(start_pfn, end_pfn, ret_val);
start = end;
}
}
@@ -495,6 +495,8 @@ void __init xen_remap_memory(void)
set_pte_mfn(buf, mfn_save, PAGE_KERNEL);
pr_info("Remapped %ld page(s)\n", remapped);
+
+ xen_do_remap_nonram();
}
static unsigned long __init xen_get_pages_limit(void)
@@ -568,7 +570,7 @@ static void __init xen_ignore_unusable(void)
}
}
-bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size)
+static bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size)
{
struct e820_entry *entry;
unsigned mapcnt;
@@ -626,6 +628,111 @@ phys_addr_t __init xen_find_free_area(phys_addr_t size)
}
/*
+ * Swap a non-RAM E820 map entry with RAM above ini_nr_pages.
+ * Note that the E820 map is modified accordingly, but the P2M map isn't yet.
+ * The adaption of the P2M must be deferred until page allocation is possible.
+ */
+static void __init xen_e820_swap_entry_with_ram(struct e820_entry *swap_entry)
+{
+ struct e820_entry *entry;
+ unsigned int mapcnt;
+ phys_addr_t mem_end = PFN_PHYS(ini_nr_pages);
+ phys_addr_t swap_addr, swap_size, entry_end;
+
+ swap_addr = PAGE_ALIGN_DOWN(swap_entry->addr);
+ swap_size = PAGE_ALIGN(swap_entry->addr - swap_addr + swap_entry->size);
+ entry = xen_e820_table.entries;
+
+ for (mapcnt = 0; mapcnt < xen_e820_table.nr_entries; mapcnt++) {
+ entry_end = entry->addr + entry->size;
+ if (entry->type == E820_TYPE_RAM && entry->size >= swap_size &&
+ entry_end - swap_size >= mem_end) {
+ /* Reduce RAM entry by needed space (whole pages). */
+ entry->size -= swap_size;
+
+ /* Add new entry at the end of E820 map. */
+ entry = xen_e820_table.entries +
+ xen_e820_table.nr_entries;
+ xen_e820_table.nr_entries++;
+
+ /* Fill new entry (keep size and page offset). */
+ entry->type = swap_entry->type;
+ entry->addr = entry_end - swap_size +
+ swap_addr - swap_entry->addr;
+ entry->size = swap_entry->size;
+
+ /* Convert old entry to RAM, align to pages. */
+ swap_entry->type = E820_TYPE_RAM;
+ swap_entry->addr = swap_addr;
+ swap_entry->size = swap_size;
+
+ /* Remember PFN<->MFN relation for P2M update. */
+ xen_add_remap_nonram(swap_addr, entry_end - swap_size,
+ swap_size);
+
+ /* Order E820 table and merge entries. */
+ e820__update_table(&xen_e820_table);
+
+ return;
+ }
+
+ entry++;
+ }
+
+ xen_raw_console_write("No suitable area found for required E820 entry remapping action\n");
+ BUG();
+}
+
+/*
+ * Look for non-RAM memory types in a specific guest physical area and move
+ * those away if possible (ACPI NVS only for now).
+ */
+static void __init xen_e820_resolve_conflicts(phys_addr_t start,
+ phys_addr_t size)
+{
+ struct e820_entry *entry;
+ unsigned int mapcnt;
+ phys_addr_t end;
+
+ if (!size)
+ return;
+
+ end = start + size;
+ entry = xen_e820_table.entries;
+
+ for (mapcnt = 0; mapcnt < xen_e820_table.nr_entries; mapcnt++) {
+ if (entry->addr >= end)
+ return;
+
+ if (entry->addr + entry->size > start &&
+ entry->type == E820_TYPE_NVS)
+ xen_e820_swap_entry_with_ram(entry);
+
+ entry++;
+ }
+}
+
+/*
+ * Check for an area in physical memory to be usable for non-movable purposes.
+ * An area is considered to usable if the used E820 map lists it to be RAM or
+ * some other type which can be moved to higher PFNs while keeping the MFNs.
+ * In case the area is not usable, crash the system with an error message.
+ */
+void __init xen_chk_is_e820_usable(phys_addr_t start, phys_addr_t size,
+ const char *component)
+{
+ xen_e820_resolve_conflicts(start, size);
+
+ if (!xen_is_e820_reserved(start, size))
+ return;
+
+ xen_raw_console_write("Xen hypervisor allocated ");
+ xen_raw_console_write(component);
+ xen_raw_console_write(" memory conflicts with E820 map\n");
+ BUG();
+}
+
+/*
* Like memcpy, but with physical addresses for dest and src.
*/
static void __init xen_phys_memcpy(phys_addr_t dest, phys_addr_t src,
@@ -684,20 +791,20 @@ static void __init xen_reserve_xen_mfnlist(void)
**/
char * __init xen_memory_setup(void)
{
- unsigned long max_pfn, pfn_s, n_pfns;
+ unsigned long pfn_s, n_pfns;
phys_addr_t mem_end, addr, size, chunk_size;
u32 type;
int rc;
struct xen_memory_map memmap;
unsigned long max_pages;
unsigned long extra_pages = 0;
+ unsigned long maxmem_pages;
int i;
int op;
xen_parse_512gb();
- max_pfn = xen_get_pages_limit();
- max_pfn = min(max_pfn, xen_start_info->nr_pages);
- mem_end = PFN_PHYS(max_pfn);
+ ini_nr_pages = min(xen_get_pages_limit(), xen_start_info->nr_pages);
+ mem_end = PFN_PHYS(ini_nr_pages);
memmap.nr_entries = ARRAY_SIZE(xen_e820_table.entries);
set_xen_guest_handle(memmap.buffer, xen_e820_table.entries);
@@ -747,13 +854,35 @@ char * __init xen_memory_setup(void)
/* Make sure the Xen-supplied memory map is well-ordered. */
e820__update_table(&xen_e820_table);
+ /*
+ * Check whether the kernel itself conflicts with the target E820 map.
+ * Failing now is better than running into weird problems later due
+ * to relocating (and even reusing) pages with kernel text or data.
+ */
+ xen_chk_is_e820_usable(__pa_symbol(_text),
+ __pa_symbol(_end) - __pa_symbol(_text),
+ "kernel");
+
+ /*
+ * Check for a conflict of the xen_start_info memory with the target
+ * E820 map.
+ */
+ xen_chk_is_e820_usable(__pa(xen_start_info), sizeof(*xen_start_info),
+ "xen_start_info");
+
+ /*
+ * Check for a conflict of the hypervisor supplied page tables with
+ * the target E820 map.
+ */
+ xen_pt_check_e820();
+
max_pages = xen_get_max_pages();
/* How many extra pages do we need due to remapping? */
- max_pages += xen_foreach_remap_area(max_pfn, xen_count_remap_pages);
+ max_pages += xen_foreach_remap_area(xen_count_remap_pages);
- if (max_pages > max_pfn)
- extra_pages += max_pages - max_pfn;
+ if (max_pages > ini_nr_pages)
+ extra_pages += max_pages - ini_nr_pages;
/*
* Clamp the amount of extra memory to a EXTRA_MEM_RATIO
@@ -762,8 +891,8 @@ char * __init xen_memory_setup(void)
* Make sure we have no memory above max_pages, as this area
* isn't handled by the p2m management.
*/
- extra_pages = min3(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
- extra_pages, max_pages - max_pfn);
+ maxmem_pages = EXTRA_MEM_RATIO * min(ini_nr_pages, PFN_DOWN(MAXMEM));
+ extra_pages = min3(maxmem_pages, extra_pages, max_pages - ini_nr_pages);
i = 0;
addr = xen_e820_table.entries[0].addr;
size = xen_e820_table.entries[0].size;
@@ -819,23 +948,6 @@ char * __init xen_memory_setup(void)
e820__update_table(e820_table);
- /*
- * Check whether the kernel itself conflicts with the target E820 map.
- * Failing now is better than running into weird problems later due
- * to relocating (and even reusing) pages with kernel text or data.
- */
- if (xen_is_e820_reserved(__pa_symbol(_text),
- __pa_symbol(__bss_stop) - __pa_symbol(_text))) {
- xen_raw_console_write("Xen hypervisor allocated kernel memory conflicts with E820 map\n");
- BUG();
- }
-
- /*
- * Check for a conflict of the hypervisor supplied page tables with
- * the target E820 map.
- */
- xen_pt_check_e820();
-
xen_reserve_xen_mfnlist();
/* Check for a conflict of the initrd with the target E820 map. */
@@ -863,7 +975,7 @@ char * __init xen_memory_setup(void)
* Set identity map on non-RAM pages and prepare remapping the
* underlying RAM.
*/
- xen_foreach_remap_area(max_pfn, xen_set_identity_and_remap_chunk);
+ xen_foreach_remap_area(xen_set_identity_and_remap_chunk);
pr_info("Released %ld page(s)\n", xen_released_pages);
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 935771726f9c..05f92c812ac8 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -9,7 +9,6 @@
#include <xen/hvc-console.h>
#include "xen-ops.h"
-#include "smp.h"
static DEFINE_PER_CPU(struct xen_common_irq, xen_resched_irq) = { .irq = -1 };
static DEFINE_PER_CPU(struct xen_common_irq, xen_callfunc_irq) = { .irq = -1 };
diff --git a/arch/x86/xen/smp.h b/arch/x86/xen/smp.h
deleted file mode 100644
index b8efdbc693f7..000000000000
--- a/arch/x86/xen/smp.h
+++ /dev/null
@@ -1,51 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _XEN_SMP_H
-
-#ifdef CONFIG_SMP
-
-void asm_cpu_bringup_and_idle(void);
-asmlinkage void cpu_bringup_and_idle(void);
-
-extern void xen_send_IPI_mask(const struct cpumask *mask,
- int vector);
-extern void xen_send_IPI_mask_allbutself(const struct cpumask *mask,
- int vector);
-extern void xen_send_IPI_allbutself(int vector);
-extern void xen_send_IPI_all(int vector);
-extern void xen_send_IPI_self(int vector);
-
-extern int xen_smp_intr_init(unsigned int cpu);
-extern void xen_smp_intr_free(unsigned int cpu);
-int xen_smp_intr_init_pv(unsigned int cpu);
-void xen_smp_intr_free_pv(unsigned int cpu);
-
-void xen_smp_count_cpus(void);
-void xen_smp_cpus_done(unsigned int max_cpus);
-
-void xen_smp_send_reschedule(int cpu);
-void xen_smp_send_call_function_ipi(const struct cpumask *mask);
-void xen_smp_send_call_function_single_ipi(int cpu);
-
-void __noreturn xen_cpu_bringup_again(unsigned long stack);
-
-struct xen_common_irq {
- int irq;
- char *name;
-};
-#else /* CONFIG_SMP */
-
-static inline int xen_smp_intr_init(unsigned int cpu)
-{
- return 0;
-}
-static inline void xen_smp_intr_free(unsigned int cpu) {}
-
-static inline int xen_smp_intr_init_pv(unsigned int cpu)
-{
- return 0;
-}
-static inline void xen_smp_intr_free_pv(unsigned int cpu) {}
-static inline void xen_smp_count_cpus(void) { }
-#endif /* CONFIG_SMP */
-
-#endif
diff --git a/arch/x86/xen/smp_hvm.c b/arch/x86/xen/smp_hvm.c
index ac95d1981cc0..485c1d8804f7 100644
--- a/arch/x86/xen/smp_hvm.c
+++ b/arch/x86/xen/smp_hvm.c
@@ -5,8 +5,6 @@
#include <xen/events.h>
#include "xen-ops.h"
-#include "smp.h"
-
static void __init xen_hvm_smp_prepare_boot_cpu(void)
{
diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c
index ac41d83b38d3..6863d3da7dec 100644
--- a/arch/x86/xen/smp_pv.c
+++ b/arch/x86/xen/smp_pv.c
@@ -46,9 +46,6 @@
#include <xen/hvc-console.h>
#include "xen-ops.h"
-#include "mmu.h"
-#include "smp.h"
-#include "pmu.h"
cpumask_var_t xen_cpu_initialized_map;
@@ -308,6 +305,7 @@ static int xen_pv_kick_ap(unsigned int cpu, struct task_struct *idle)
return rc;
xen_pmu_init(cpu);
+ mc_percpu_init(cpu);
/*
* Why is this a BUG? If the hypercall fails then everything can be
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index 5c6fc16e4b92..8e4efe0fb6f9 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -18,7 +18,6 @@
static DEFINE_PER_CPU(int, lock_kicker_irq) = -1;
static DEFINE_PER_CPU(char *, irq_name);
static DEFINE_PER_CPU(atomic_t, xen_qlock_wait_nest);
-static bool xen_pvspin = true;
static void xen_qlock_kick(int cpu)
{
@@ -68,7 +67,7 @@ void xen_init_lock_cpu(int cpu)
int irq;
char *name;
- if (!xen_pvspin)
+ if (nopvspin)
return;
WARN(per_cpu(lock_kicker_irq, cpu) >= 0, "spinlock on CPU%d exists on IRQ%d!\n",
@@ -95,7 +94,7 @@ void xen_uninit_lock_cpu(int cpu)
{
int irq;
- if (!xen_pvspin)
+ if (nopvspin)
return;
kfree(per_cpu(irq_name, cpu));
@@ -125,10 +124,10 @@ PV_CALLEE_SAVE_REGS_THUNK(xen_vcpu_stolen);
void __init xen_init_spinlocks(void)
{
/* Don't need to use pvqspinlock code if there is only 1 vCPU. */
- if (num_possible_cpus() == 1 || nopvspin)
- xen_pvspin = false;
+ if (num_possible_cpus() == 1)
+ nopvspin = true;
- if (!xen_pvspin) {
+ if (nopvspin) {
printk(KERN_DEBUG "xen: PV spinlocks disabled\n");
static_branch_disable(&virt_spin_lock_key);
return;
@@ -143,12 +142,3 @@ void __init xen_init_spinlocks(void)
pv_ops.lock.kick = xen_qlock_kick;
pv_ops.lock.vcpu_is_preempted = PV_CALLEE_SAVE(xen_vcpu_stolen);
}
-
-static __init int xen_parse_nopvspin(char *arg)
-{
- pr_notice("\"xen_nopvspin\" is deprecated, please use \"nopvspin\" instead\n");
- xen_pvspin = false;
- return 0;
-}
-early_param("xen_nopvspin", xen_parse_nopvspin);
-
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index 1d83152c761b..77a6ea1c60e4 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -15,8 +15,6 @@
#include <asm/fixmap.h>
#include "xen-ops.h"
-#include "mmu.h"
-#include "pmu.h"
static DEFINE_PER_CPU(u64, spec_ctrl);
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 52fa5609b7f6..96521b1874ac 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -30,7 +30,7 @@
#include "xen-ops.h"
/* Minimum amount of time until next clock event fires */
-#define TIMER_SLOP 100000
+#define TIMER_SLOP 1
static u64 xen_sched_clock_offset __read_mostly;
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 04101b984f24..758bcd47b72d 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -49,7 +49,7 @@ SYM_CODE_START(startup_xen)
ANNOTATE_NOENDBR
cld
- leaq (__end_init_task - TOP_OF_KERNEL_STACK_PADDING - PTREGS_SIZE)(%rip), %rsp
+ leaq __top_init_kernel_stack(%rip), %rsp
/* Set up %gs.
*
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 79cf93f2c92f..e1b782e823e6 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -5,8 +5,15 @@
#include <linux/init.h>
#include <linux/clocksource.h>
#include <linux/irqreturn.h>
+#include <linux/linkage.h>
+
+#include <xen/interface/xenpmu.h>
#include <xen/xen-ops.h>
+#include <asm/page.h>
+
+#include <trace/events/xen.h>
+
/* These are code, but not functions. Defined in entry.S */
extern const char xen_failsafe_callback[];
@@ -23,14 +30,11 @@ void xen_copy_trap_info(struct trap_info *traps);
DECLARE_PER_CPU_ALIGNED(struct vcpu_info, xen_vcpu_info);
DECLARE_PER_CPU(unsigned long, xen_cr3);
-DECLARE_PER_CPU(unsigned long, xen_current_cr3);
extern struct start_info *xen_start_info;
extern struct shared_info xen_dummy_shared_info;
extern struct shared_info *HYPERVISOR_shared_info;
-extern bool xen_fifo_events;
-
void xen_setup_mfn_list_list(void);
void xen_build_mfn_list_list(void);
void xen_setup_machphys_mapping(void);
@@ -43,8 +47,12 @@ void xen_mm_unpin_all(void);
#ifdef CONFIG_X86_64
void __init xen_relocate_p2m(void);
#endif
+void __init xen_do_remap_nonram(void);
+void __init xen_add_remap_nonram(phys_addr_t maddr, phys_addr_t paddr,
+ unsigned long size);
-bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size);
+void __init xen_chk_is_e820_usable(phys_addr_t start, phys_addr_t size,
+ const char *component);
unsigned long __ref xen_chk_extra_mem(unsigned long pfn);
void __init xen_inv_extra_mem(void);
void __init xen_remap_memory(void);
@@ -177,4 +185,145 @@ static inline void xen_hvm_post_suspend(int suspend_cancelled) {}
void xen_add_extra_mem(unsigned long start_pfn, unsigned long n_pfns);
+struct dentry * __init xen_init_debugfs(void);
+
+enum pt_level {
+ PT_PGD,
+ PT_P4D,
+ PT_PUD,
+ PT_PMD,
+ PT_PTE
+};
+
+bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn);
+void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
+unsigned long xen_read_cr2_direct(void);
+void xen_init_mmu_ops(void);
+void xen_hvm_init_mmu_ops(void);
+
+/* Multicalls */
+struct multicall_space
+{
+ struct multicall_entry *mc;
+ void *args;
+};
+
+/* Allocate room for a multicall and its args */
+struct multicall_space __xen_mc_entry(size_t args);
+
+DECLARE_PER_CPU(unsigned long, xen_mc_irq_flags);
+
+/* Call to start a batch of multiple __xen_mc_entry()s. Must be
+ paired with xen_mc_issue() */
+static inline void xen_mc_batch(void)
+{
+ unsigned long flags;
+
+ /* need to disable interrupts until this entry is complete */
+ local_irq_save(flags);
+ trace_xen_mc_batch(xen_get_lazy_mode());
+ __this_cpu_write(xen_mc_irq_flags, flags);
+}
+
+static inline struct multicall_space xen_mc_entry(size_t args)
+{
+ xen_mc_batch();
+ return __xen_mc_entry(args);
+}
+
+/* Flush all pending multicalls */
+void xen_mc_flush(void);
+
+/* Issue a multicall if we're not in a lazy mode */
+static inline void xen_mc_issue(unsigned mode)
+{
+ trace_xen_mc_issue(mode);
+
+ if ((xen_get_lazy_mode() & mode) == 0)
+ xen_mc_flush();
+
+ /* restore flags saved in xen_mc_batch */
+ local_irq_restore(this_cpu_read(xen_mc_irq_flags));
+}
+
+/* Set up a callback to be called when the current batch is flushed */
+void xen_mc_callback(void (*fn)(void *), void *data);
+
+/*
+ * Try to extend the arguments of the previous multicall command. The
+ * previous command's op must match. If it does, then it attempts to
+ * extend the argument space allocated to the multicall entry by
+ * arg_size bytes.
+ *
+ * The returned multicall_space will return with mc pointing to the
+ * command on success, or NULL on failure, and args pointing to the
+ * newly allocated space.
+ */
+struct multicall_space xen_mc_extend_args(unsigned long op, size_t arg_size);
+
+/* Do percpu data initialization for multicalls. */
+void mc_percpu_init(unsigned int cpu);
+
+extern bool is_xen_pmu;
+
+irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id);
+#ifdef CONFIG_XEN_HAVE_VPMU
+void xen_pmu_init(int cpu);
+void xen_pmu_finish(int cpu);
+#else
+static inline void xen_pmu_init(int cpu) {}
+static inline void xen_pmu_finish(int cpu) {}
+#endif
+bool pmu_msr_read(unsigned int msr, uint64_t *val, int *err);
+bool pmu_msr_write(unsigned int msr, uint32_t low, uint32_t high, int *err);
+int pmu_apic_update(uint32_t reg);
+unsigned long long xen_read_pmc(int counter);
+
+#ifdef CONFIG_SMP
+
+void asm_cpu_bringup_and_idle(void);
+asmlinkage void cpu_bringup_and_idle(void);
+
+extern void xen_send_IPI_mask(const struct cpumask *mask,
+ int vector);
+extern void xen_send_IPI_mask_allbutself(const struct cpumask *mask,
+ int vector);
+extern void xen_send_IPI_allbutself(int vector);
+extern void xen_send_IPI_all(int vector);
+extern void xen_send_IPI_self(int vector);
+
+extern int xen_smp_intr_init(unsigned int cpu);
+extern void xen_smp_intr_free(unsigned int cpu);
+int xen_smp_intr_init_pv(unsigned int cpu);
+void xen_smp_intr_free_pv(unsigned int cpu);
+
+void xen_smp_count_cpus(void);
+void xen_smp_cpus_done(unsigned int max_cpus);
+
+void xen_smp_send_reschedule(int cpu);
+void xen_smp_send_call_function_ipi(const struct cpumask *mask);
+void xen_smp_send_call_function_single_ipi(int cpu);
+
+void __noreturn xen_cpu_bringup_again(unsigned long stack);
+
+struct xen_common_irq {
+ int irq;
+ char *name;
+};
+#else /* CONFIG_SMP */
+
+static inline int xen_smp_intr_init(unsigned int cpu)
+{
+ return 0;
+}
+static inline void xen_smp_intr_free(unsigned int cpu) {}
+
+static inline int xen_smp_intr_init_pv(unsigned int cpu)
+{
+ return 0;
+}
+static inline void xen_smp_intr_free_pv(unsigned int cpu) {}
+static inline void xen_smp_count_cpus(void) { }
+#endif /* CONFIG_SMP */
+
#endif /* XEN_OPS_H */