summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJoel Stanley <joel@jms.id.au>2018-08-22 11:16:27 +0300
committerJoel Stanley <joel@jms.id.au>2018-08-22 11:16:32 +0300
commit80d901318acc389e6e09698a8927bf6eb29da314 (patch)
tree60d9d330f12ab9c1525f6b8e5a46d875a1a6bf47
parentc71662c749dc66a542e717dbd51fefab995c9455 (diff)
parent97f21471e8e48fecb0b1c1b96fe934d1148dffe2 (diff)
downloadlinux-80d901318acc389e6e09698a8927bf6eb29da314.tar.xz
Merge tag 'v4.17.18' into dev-4.17
This is the 4.17.18 stable release Signed-off-by: Joel Stanley <joel@jms.id.au>
-rw-r--r--Documentation/ABI/testing/sysfs-devices-system-cpu24
-rw-r--r--Documentation/admin-guide/index.rst9
-rw-r--r--Documentation/admin-guide/kernel-parameters.txt78
-rw-r--r--Documentation/admin-guide/l1tf.rst610
-rw-r--r--Documentation/process/changes.rst19
-rw-r--r--Makefile2
-rw-r--r--arch/Kconfig3
-rw-r--r--arch/arm/boot/dts/imx6sx.dtsi2
-rw-r--r--arch/arm64/mm/mmu.c4
-rw-r--r--arch/parisc/Kconfig2
-rw-r--r--arch/parisc/include/asm/barrier.h32
-rw-r--r--arch/parisc/kernel/entry.S2
-rw-r--r--arch/parisc/kernel/pacache.S1
-rw-r--r--arch/parisc/kernel/syscall.S4
-rw-r--r--arch/x86/Kconfig1
-rw-r--r--arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S2
-rw-r--r--arch/x86/include/asm/apic.h9
-rw-r--r--arch/x86/include/asm/cpufeatures.h3
-rw-r--r--arch/x86/include/asm/dmi.h2
-rw-r--r--arch/x86/include/asm/hardirq.h26
-rw-r--r--arch/x86/include/asm/i8259.h1
-rw-r--r--arch/x86/include/asm/irqflags.h2
-rw-r--r--arch/x86/include/asm/kvm_host.h6
-rw-r--r--arch/x86/include/asm/msr-index.h7
-rw-r--r--arch/x86/include/asm/page_32_types.h9
-rw-r--r--arch/x86/include/asm/pgtable-2level.h17
-rw-r--r--arch/x86/include/asm/pgtable-3level.h37
-rw-r--r--arch/x86/include/asm/pgtable-invert.h41
-rw-r--r--arch/x86/include/asm/pgtable.h74
-rw-r--r--arch/x86/include/asm/pgtable_64.h38
-rw-r--r--arch/x86/include/asm/processor.h17
-rw-r--r--arch/x86/include/asm/smp.h1
-rw-r--r--arch/x86/include/asm/topology.h6
-rw-r--r--arch/x86/include/asm/vmx.h11
-rw-r--r--arch/x86/kernel/apic/apic.c18
-rw-r--r--arch/x86/kernel/apic/io_apic.c1
-rw-r--r--arch/x86/kernel/apic/msi.c1
-rw-r--r--arch/x86/kernel/apic/vector.c1
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c4
-rw-r--r--arch/x86/kernel/cpu/amd.c59
-rw-r--r--arch/x86/kernel/cpu/bugs.c170
-rw-r--r--arch/x86/kernel/cpu/common.c65
-rw-r--r--arch/x86/kernel/cpu/cpu.h3
-rw-r--r--arch/x86/kernel/cpu/intel.c7
-rw-r--r--arch/x86/kernel/cpu/microcode/core.c16
-rw-r--r--arch/x86/kernel/cpu/topology.c41
-rw-r--r--arch/x86/kernel/fpu/core.c1
-rw-r--r--arch/x86/kernel/hpet.c1
-rw-r--r--arch/x86/kernel/i8259.c1
-rw-r--r--arch/x86/kernel/idt.c1
-rw-r--r--arch/x86/kernel/irq.c1
-rw-r--r--arch/x86/kernel/irq_32.c1
-rw-r--r--arch/x86/kernel/irq_64.c1
-rw-r--r--arch/x86/kernel/irqinit.c1
-rw-r--r--arch/x86/kernel/kprobes/core.c5
-rw-r--r--arch/x86/kernel/paravirt.c14
-rw-r--r--arch/x86/kernel/setup.c6
-rw-r--r--arch/x86/kernel/smp.c1
-rw-r--r--arch/x86/kernel/smpboot.c25
-rw-r--r--arch/x86/kernel/time.c1
-rw-r--r--arch/x86/kvm/mmu.c1
-rw-r--r--arch/x86/kvm/vmx.c455
-rw-r--r--arch/x86/kvm/x86.c34
-rw-r--r--arch/x86/mm/init.c25
-rw-r--r--arch/x86/mm/kmmio.c25
-rw-r--r--arch/x86/mm/mmap.c21
-rw-r--r--arch/x86/mm/pageattr.c14
-rw-r--r--arch/x86/mm/pgtable.c61
-rw-r--r--arch/x86/mm/pti.c35
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c1
-rw-r--r--arch/x86/platform/uv/tlb_uv.c1
-rw-r--r--arch/x86/xen/enlighten.c1
-rw-r--r--arch/x86/xen/enlighten_pv.c3
-rw-r--r--crypto/ablkcipher.c57
-rw-r--r--crypto/blkcipher.c54
-rw-r--r--crypto/skcipher.c55
-rw-r--r--crypto/vmac.c412
-rw-r--r--drivers/acpi/sleep.c8
-rw-r--r--drivers/base/cpu.c8
-rw-r--r--drivers/block/zram/zram_drv.c15
-rw-r--r--drivers/crypto/ccp/psp-dev.c7
-rw-r--r--drivers/crypto/ccree/cc_cipher.c111
-rw-r--r--drivers/crypto/ccree/cc_hash.c81
-rw-r--r--drivers/gpu/drm/i915/i915_pmu.c1
-rw-r--r--drivers/gpu/drm/i915/intel_lpe_audio.c1
-rw-r--r--drivers/isdn/i4l/isdn_common.c8
-rw-r--r--drivers/misc/sram.c9
-rw-r--r--drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c2
-rw-r--r--drivers/net/ethernet/marvell/mvneta.c53
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en_tc.c8
-rw-r--r--drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_actions.c51
-rw-r--r--drivers/net/ethernet/realtek/r8169.c12
-rw-r--r--drivers/net/xen-netfront.c8
-rw-r--r--drivers/pci/host/pci-hyperv.c2
-rw-r--r--drivers/scsi/qla2xxx/qla_iocb.c53
-rw-r--r--drivers/scsi/sr.c29
-rw-r--r--drivers/tty/serial/8250/8250_dw.c3
-rw-r--r--drivers/tty/serial/8250/8250_exar.c6
-rw-r--r--drivers/tty/serial/8250/8250_port.c3
-rw-r--r--drivers/usb/serial/option.c4
-rw-r--r--drivers/usb/serial/pl2303.c2
-rw-r--r--drivers/usb/serial/pl2303.h1
-rw-r--r--drivers/usb/serial/sierra.c4
-rw-r--r--drivers/vhost/vhost.c9
-rw-r--r--fs/dcache.c13
-rw-r--r--fs/namespace.c28
-rw-r--r--include/asm-generic/pgtable.h20
-rw-r--r--include/crypto/vmac.h63
-rw-r--r--include/linux/cpu.h23
-rw-r--r--include/linux/swapfile.h2
-rw-r--r--include/net/af_vsock.h4
-rw-r--r--include/net/llc.h5
-rw-r--r--init/main.c2
-rw-r--r--kernel/bpf/sockmap.c9
-rw-r--r--kernel/cpu.c284
-rw-r--r--kernel/sched/core.c30
-rw-r--r--kernel/sched/deadline.c8
-rw-r--r--kernel/sched/fair.c1
-rw-r--r--kernel/smp.c2
-rw-r--r--kernel/softirq.c12
-rw-r--r--kernel/stop_machine.c10
-rw-r--r--lib/ioremap.c4
-rw-r--r--mm/memory.c37
-rw-r--r--mm/mprotect.c49
-rw-r--r--mm/swapfile.c46
-rw-r--r--net/bluetooth/hidp/core.c4
-rw-r--r--net/bluetooth/sco.c3
-rw-r--r--net/core/sock_diag.c2
-rw-r--r--net/dccp/ccids/ccid2.c6
-rw-r--r--net/ipv4/ip_vti.c3
-rw-r--r--net/ipv6/ip6_tunnel.c8
-rw-r--r--net/l2tp/l2tp_core.c2
-rw-r--r--net/llc/llc_core.c4
-rw-r--r--net/rxrpc/ar-internal.h8
-rw-r--r--net/rxrpc/conn_event.c4
-rw-r--r--net/rxrpc/net_ns.c6
-rw-r--r--net/rxrpc/output.c12
-rw-r--r--net/rxrpc/peer_event.c156
-rw-r--r--net/rxrpc/peer_object.c8
-rw-r--r--net/rxrpc/rxkad.c4
-rw-r--r--net/sched/cls_matchall.c2
-rw-r--r--net/sched/cls_tcindex.c8
-rw-r--r--net/socket.c3
-rw-r--r--net/vmw_vsock/af_vsock.c15
-rw-r--r--net/vmw_vsock/vmci_transport.c3
-rwxr-xr-xscripts/depmod.sh8
-rw-r--r--sound/core/memalloc.c8
-rw-r--r--sound/core/seq/oss/seq_oss.c2
-rw-r--r--sound/core/seq/seq_clientmgr.c2
-rw-r--r--sound/core/seq/seq_virmidi.c10
-rw-r--r--sound/pci/cs5535audio/cs5535audio.h6
-rw-r--r--sound/pci/cs5535audio/cs5535audio_pcm.c4
-rw-r--r--sound/pci/hda/hda_intel.c2
-rw-r--r--sound/pci/hda/patch_conexant.c4
-rw-r--r--sound/pci/vx222/vx222_ops.c8
-rw-r--r--sound/pcmcia/vx/vxp_ops.c10
-rw-r--r--tools/arch/x86/include/asm/cpufeatures.h23
-rw-r--r--tools/include/uapi/linux/prctl.h12
158 files changed, 3207 insertions, 1107 deletions
diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu
index bd4975e132d3..6048a81fa744 100644
--- a/Documentation/ABI/testing/sysfs-devices-system-cpu
+++ b/Documentation/ABI/testing/sysfs-devices-system-cpu
@@ -479,6 +479,7 @@ What: /sys/devices/system/cpu/vulnerabilities
/sys/devices/system/cpu/vulnerabilities/spectre_v1
/sys/devices/system/cpu/vulnerabilities/spectre_v2
/sys/devices/system/cpu/vulnerabilities/spec_store_bypass
+ /sys/devices/system/cpu/vulnerabilities/l1tf
Date: January 2018
Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org>
Description: Information about CPU vulnerabilities
@@ -490,3 +491,26 @@ Description: Information about CPU vulnerabilities
"Not affected" CPU is not affected by the vulnerability
"Vulnerable" CPU is affected and no mitigation in effect
"Mitigation: $M" CPU is affected and mitigation $M is in effect
+
+ Details about the l1tf file can be found in
+ Documentation/admin-guide/l1tf.rst
+
+What: /sys/devices/system/cpu/smt
+ /sys/devices/system/cpu/smt/active
+ /sys/devices/system/cpu/smt/control
+Date: June 2018
+Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org>
+Description: Control Symetric Multi Threading (SMT)
+
+ active: Tells whether SMT is active (enabled and siblings online)
+
+ control: Read/write interface to control SMT. Possible
+ values:
+
+ "on" SMT is enabled
+ "off" SMT is disabled
+ "forceoff" SMT is force disabled. Cannot be changed.
+ "notsupported" SMT is not supported by the CPU
+
+ If control status is "forceoff" or "notsupported" writes
+ are rejected.
diff --git a/Documentation/admin-guide/index.rst b/Documentation/admin-guide/index.rst
index 5bb9161dbe6a..78f8f00c369f 100644
--- a/Documentation/admin-guide/index.rst
+++ b/Documentation/admin-guide/index.rst
@@ -17,6 +17,15 @@ etc.
kernel-parameters
devices
+This section describes CPU vulnerabilities and provides an overview of the
+possible mitigations along with guidance for selecting mitigations if they
+are configurable at compile, boot or run time.
+
+.. toctree::
+ :maxdepth: 1
+
+ l1tf
+
Here is a set of documents aimed at users who are trying to track down
problems and bugs in particular.
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index ff4ba249a26f..d7dd58ccf0d4 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1958,10 +1958,84 @@
(virtualized real and unpaged mode) on capable
Intel chips. Default is 1 (enabled)
+ kvm-intel.vmentry_l1d_flush=[KVM,Intel] Mitigation for L1 Terminal Fault
+ CVE-2018-3620.
+
+ Valid arguments: never, cond, always
+
+ always: L1D cache flush on every VMENTER.
+ cond: Flush L1D on VMENTER only when the code between
+ VMEXIT and VMENTER can leak host memory.
+ never: Disables the mitigation
+
+ Default is cond (do L1 cache flush in specific instances)
+
kvm-intel.vpid= [KVM,Intel] Disable Virtual Processor Identification
feature (tagged TLBs) on capable Intel chips.
Default is 1 (enabled)
+ l1tf= [X86] Control mitigation of the L1TF vulnerability on
+ affected CPUs
+
+ The kernel PTE inversion protection is unconditionally
+ enabled and cannot be disabled.
+
+ full
+ Provides all available mitigations for the
+ L1TF vulnerability. Disables SMT and
+ enables all mitigations in the
+ hypervisors, i.e. unconditional L1D flush.
+
+ SMT control and L1D flush control via the
+ sysfs interface is still possible after
+ boot. Hypervisors will issue a warning
+ when the first VM is started in a
+ potentially insecure configuration,
+ i.e. SMT enabled or L1D flush disabled.
+
+ full,force
+ Same as 'full', but disables SMT and L1D
+ flush runtime control. Implies the
+ 'nosmt=force' command line option.
+ (i.e. sysfs control of SMT is disabled.)
+
+ flush
+ Leaves SMT enabled and enables the default
+ hypervisor mitigation, i.e. conditional
+ L1D flush.
+
+ SMT control and L1D flush control via the
+ sysfs interface is still possible after
+ boot. Hypervisors will issue a warning
+ when the first VM is started in a
+ potentially insecure configuration,
+ i.e. SMT enabled or L1D flush disabled.
+
+ flush,nosmt
+
+ Disables SMT and enables the default
+ hypervisor mitigation.
+
+ SMT control and L1D flush control via the
+ sysfs interface is still possible after
+ boot. Hypervisors will issue a warning
+ when the first VM is started in a
+ potentially insecure configuration,
+ i.e. SMT enabled or L1D flush disabled.
+
+ flush,nowarn
+ Same as 'flush', but hypervisors will not
+ warn when a VM is started in a potentially
+ insecure configuration.
+
+ off
+ Disables hypervisor mitigations and doesn't
+ emit any warnings.
+
+ Default is 'flush'.
+
+ For details see: Documentation/admin-guide/l1tf.rst
+
l2cr= [PPC]
l3cr= [PPC]
@@ -2675,6 +2749,10 @@
nosmt [KNL,S390] Disable symmetric multithreading (SMT).
Equivalent to smt=1.
+ [KNL,x86] Disable symmetric multithreading (SMT).
+ nosmt=force: Force disable SMT, cannot be undone
+ via the sysfs control file.
+
nospectre_v2 [X86] Disable all mitigations for the Spectre variant 2
(indirect branch prediction) vulnerability. System may
allow data leaks with this option, which is equivalent
diff --git a/Documentation/admin-guide/l1tf.rst b/Documentation/admin-guide/l1tf.rst
new file mode 100644
index 000000000000..bae52b845de0
--- /dev/null
+++ b/Documentation/admin-guide/l1tf.rst
@@ -0,0 +1,610 @@
+L1TF - L1 Terminal Fault
+========================
+
+L1 Terminal Fault is a hardware vulnerability which allows unprivileged
+speculative access to data which is available in the Level 1 Data Cache
+when the page table entry controlling the virtual address, which is used
+for the access, has the Present bit cleared or other reserved bits set.
+
+Affected processors
+-------------------
+
+This vulnerability affects a wide range of Intel processors. The
+vulnerability is not present on:
+
+ - Processors from AMD, Centaur and other non Intel vendors
+
+ - Older processor models, where the CPU family is < 6
+
+ - A range of Intel ATOM processors (Cedarview, Cloverview, Lincroft,
+ Penwell, Pineview, Silvermont, Airmont, Merrifield)
+
+ - The Intel XEON PHI family
+
+ - Intel processors which have the ARCH_CAP_RDCL_NO bit set in the
+ IA32_ARCH_CAPABILITIES MSR. If the bit is set the CPU is not affected
+ by the Meltdown vulnerability either. These CPUs should become
+ available by end of 2018.
+
+Whether a processor is affected or not can be read out from the L1TF
+vulnerability file in sysfs. See :ref:`l1tf_sys_info`.
+
+Related CVEs
+------------
+
+The following CVE entries are related to the L1TF vulnerability:
+
+ ============= ================= ==============================
+ CVE-2018-3615 L1 Terminal Fault SGX related aspects
+ CVE-2018-3620 L1 Terminal Fault OS, SMM related aspects
+ CVE-2018-3646 L1 Terminal Fault Virtualization related aspects
+ ============= ================= ==============================
+
+Problem
+-------
+
+If an instruction accesses a virtual address for which the relevant page
+table entry (PTE) has the Present bit cleared or other reserved bits set,
+then speculative execution ignores the invalid PTE and loads the referenced
+data if it is present in the Level 1 Data Cache, as if the page referenced
+by the address bits in the PTE was still present and accessible.
+
+While this is a purely speculative mechanism and the instruction will raise
+a page fault when it is retired eventually, the pure act of loading the
+data and making it available to other speculative instructions opens up the
+opportunity for side channel attacks to unprivileged malicious code,
+similar to the Meltdown attack.
+
+While Meltdown breaks the user space to kernel space protection, L1TF
+allows to attack any physical memory address in the system and the attack
+works across all protection domains. It allows an attack of SGX and also
+works from inside virtual machines because the speculation bypasses the
+extended page table (EPT) protection mechanism.
+
+
+Attack scenarios
+----------------
+
+1. Malicious user space
+^^^^^^^^^^^^^^^^^^^^^^^
+
+ Operating Systems store arbitrary information in the address bits of a
+ PTE which is marked non present. This allows a malicious user space
+ application to attack the physical memory to which these PTEs resolve.
+ In some cases user-space can maliciously influence the information
+ encoded in the address bits of the PTE, thus making attacks more
+ deterministic and more practical.
+
+ The Linux kernel contains a mitigation for this attack vector, PTE
+ inversion, which is permanently enabled and has no performance
+ impact. The kernel ensures that the address bits of PTEs, which are not
+ marked present, never point to cacheable physical memory space.
+
+ A system with an up to date kernel is protected against attacks from
+ malicious user space applications.
+
+2. Malicious guest in a virtual machine
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+ The fact that L1TF breaks all domain protections allows malicious guest
+ OSes, which can control the PTEs directly, and malicious guest user
+ space applications, which run on an unprotected guest kernel lacking the
+ PTE inversion mitigation for L1TF, to attack physical host memory.
+
+ A special aspect of L1TF in the context of virtualization is symmetric
+ multi threading (SMT). The Intel implementation of SMT is called
+ HyperThreading. The fact that Hyperthreads on the affected processors
+ share the L1 Data Cache (L1D) is important for this. As the flaw allows
+ only to attack data which is present in L1D, a malicious guest running
+ on one Hyperthread can attack the data which is brought into the L1D by
+ the context which runs on the sibling Hyperthread of the same physical
+ core. This context can be host OS, host user space or a different guest.
+
+ If the processor does not support Extended Page Tables, the attack is
+ only possible, when the hypervisor does not sanitize the content of the
+ effective (shadow) page tables.
+
+ While solutions exist to mitigate these attack vectors fully, these
+ mitigations are not enabled by default in the Linux kernel because they
+ can affect performance significantly. The kernel provides several
+ mechanisms which can be utilized to address the problem depending on the
+ deployment scenario. The mitigations, their protection scope and impact
+ are described in the next sections.
+
+ The default mitigations and the rationale for choosing them are explained
+ at the end of this document. See :ref:`default_mitigations`.
+
+.. _l1tf_sys_info:
+
+L1TF system information
+-----------------------
+
+The Linux kernel provides a sysfs interface to enumerate the current L1TF
+status of the system: whether the system is vulnerable, and which
+mitigations are active. The relevant sysfs file is:
+
+/sys/devices/system/cpu/vulnerabilities/l1tf
+
+The possible values in this file are:
+
+ =========================== ===============================
+ 'Not affected' The processor is not vulnerable
+ 'Mitigation: PTE Inversion' The host protection is active
+ =========================== ===============================
+
+If KVM/VMX is enabled and the processor is vulnerable then the following
+information is appended to the 'Mitigation: PTE Inversion' part:
+
+ - SMT status:
+
+ ===================== ================
+ 'VMX: SMT vulnerable' SMT is enabled
+ 'VMX: SMT disabled' SMT is disabled
+ ===================== ================
+
+ - L1D Flush mode:
+
+ ================================ ====================================
+ 'L1D vulnerable' L1D flushing is disabled
+
+ 'L1D conditional cache flushes' L1D flush is conditionally enabled
+
+ 'L1D cache flushes' L1D flush is unconditionally enabled
+ ================================ ====================================
+
+The resulting grade of protection is discussed in the following sections.
+
+
+Host mitigation mechanism
+-------------------------
+
+The kernel is unconditionally protected against L1TF attacks from malicious
+user space running on the host.
+
+
+Guest mitigation mechanisms
+---------------------------
+
+.. _l1d_flush:
+
+1. L1D flush on VMENTER
+^^^^^^^^^^^^^^^^^^^^^^^
+
+ To make sure that a guest cannot attack data which is present in the L1D
+ the hypervisor flushes the L1D before entering the guest.
+
+ Flushing the L1D evicts not only the data which should not be accessed
+ by a potentially malicious guest, it also flushes the guest
+ data. Flushing the L1D has a performance impact as the processor has to
+ bring the flushed guest data back into the L1D. Depending on the
+ frequency of VMEXIT/VMENTER and the type of computations in the guest
+ performance degradation in the range of 1% to 50% has been observed. For
+ scenarios where guest VMEXIT/VMENTER are rare the performance impact is
+ minimal. Virtio and mechanisms like posted interrupts are designed to
+ confine the VMEXITs to a bare minimum, but specific configurations and
+ application scenarios might still suffer from a high VMEXIT rate.
+
+ The kernel provides two L1D flush modes:
+ - conditional ('cond')
+ - unconditional ('always')
+
+ The conditional mode avoids L1D flushing after VMEXITs which execute
+ only audited code paths before the corresponding VMENTER. These code
+ paths have been verified that they cannot expose secrets or other
+ interesting data to an attacker, but they can leak information about the
+ address space layout of the hypervisor.
+
+ Unconditional mode flushes L1D on all VMENTER invocations and provides
+ maximum protection. It has a higher overhead than the conditional
+ mode. The overhead cannot be quantified correctly as it depends on the
+ workload scenario and the resulting number of VMEXITs.
+
+ The general recommendation is to enable L1D flush on VMENTER. The kernel
+ defaults to conditional mode on affected processors.
+
+ **Note**, that L1D flush does not prevent the SMT problem because the
+ sibling thread will also bring back its data into the L1D which makes it
+ attackable again.
+
+ L1D flush can be controlled by the administrator via the kernel command
+ line and sysfs control files. See :ref:`mitigation_control_command_line`
+ and :ref:`mitigation_control_kvm`.
+
+.. _guest_confinement:
+
+2. Guest VCPU confinement to dedicated physical cores
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+ To address the SMT problem, it is possible to make a guest or a group of
+ guests affine to one or more physical cores. The proper mechanism for
+ that is to utilize exclusive cpusets to ensure that no other guest or
+ host tasks can run on these cores.
+
+ If only a single guest or related guests run on sibling SMT threads on
+ the same physical core then they can only attack their own memory and
+ restricted parts of the host memory.
+
+ Host memory is attackable, when one of the sibling SMT threads runs in
+ host OS (hypervisor) context and the other in guest context. The amount
+ of valuable information from the host OS context depends on the context
+ which the host OS executes, i.e. interrupts, soft interrupts and kernel
+ threads. The amount of valuable data from these contexts cannot be
+ declared as non-interesting for an attacker without deep inspection of
+ the code.
+
+ **Note**, that assigning guests to a fixed set of physical cores affects
+ the ability of the scheduler to do load balancing and might have
+ negative effects on CPU utilization depending on the hosting
+ scenario. Disabling SMT might be a viable alternative for particular
+ scenarios.
+
+ For further information about confining guests to a single or to a group
+ of cores consult the cpusets documentation:
+
+ https://www.kernel.org/doc/Documentation/cgroup-v1/cpusets.txt
+
+.. _interrupt_isolation:
+
+3. Interrupt affinity
+^^^^^^^^^^^^^^^^^^^^^
+
+ Interrupts can be made affine to logical CPUs. This is not universally
+ true because there are types of interrupts which are truly per CPU
+ interrupts, e.g. the local timer interrupt. Aside of that multi queue
+ devices affine their interrupts to single CPUs or groups of CPUs per
+ queue without allowing the administrator to control the affinities.
+
+ Moving the interrupts, which can be affinity controlled, away from CPUs
+ which run untrusted guests, reduces the attack vector space.
+
+ Whether the interrupts with are affine to CPUs, which run untrusted
+ guests, provide interesting data for an attacker depends on the system
+ configuration and the scenarios which run on the system. While for some
+ of the interrupts it can be assumed that they won't expose interesting
+ information beyond exposing hints about the host OS memory layout, there
+ is no way to make general assumptions.
+
+ Interrupt affinity can be controlled by the administrator via the
+ /proc/irq/$NR/smp_affinity[_list] files. Limited documentation is
+ available at:
+
+ https://www.kernel.org/doc/Documentation/IRQ-affinity.txt
+
+.. _smt_control:
+
+4. SMT control
+^^^^^^^^^^^^^^
+
+ To prevent the SMT issues of L1TF it might be necessary to disable SMT
+ completely. Disabling SMT can have a significant performance impact, but
+ the impact depends on the hosting scenario and the type of workloads.
+ The impact of disabling SMT needs also to be weighted against the impact
+ of other mitigation solutions like confining guests to dedicated cores.
+
+ The kernel provides a sysfs interface to retrieve the status of SMT and
+ to control it. It also provides a kernel command line interface to
+ control SMT.
+
+ The kernel command line interface consists of the following options:
+
+ =========== ==========================================================
+ nosmt Affects the bring up of the secondary CPUs during boot. The
+ kernel tries to bring all present CPUs online during the
+ boot process. "nosmt" makes sure that from each physical
+ core only one - the so called primary (hyper) thread is
+ activated. Due to a design flaw of Intel processors related
+ to Machine Check Exceptions the non primary siblings have
+ to be brought up at least partially and are then shut down
+ again. "nosmt" can be undone via the sysfs interface.
+
+ nosmt=force Has the same effect as "nosmt" but it does not allow to
+ undo the SMT disable via the sysfs interface.
+ =========== ==========================================================
+
+ The sysfs interface provides two files:
+
+ - /sys/devices/system/cpu/smt/control
+ - /sys/devices/system/cpu/smt/active
+
+ /sys/devices/system/cpu/smt/control:
+
+ This file allows to read out the SMT control state and provides the
+ ability to disable or (re)enable SMT. The possible states are:
+
+ ============== ===================================================
+ on SMT is supported by the CPU and enabled. All
+ logical CPUs can be onlined and offlined without
+ restrictions.
+
+ off SMT is supported by the CPU and disabled. Only
+ the so called primary SMT threads can be onlined
+ and offlined without restrictions. An attempt to
+ online a non-primary sibling is rejected
+
+ forceoff Same as 'off' but the state cannot be controlled.
+ Attempts to write to the control file are rejected.
+
+ notsupported The processor does not support SMT. It's therefore
+ not affected by the SMT implications of L1TF.
+ Attempts to write to the control file are rejected.
+ ============== ===================================================
+
+ The possible states which can be written into this file to control SMT
+ state are:
+
+ - on
+ - off
+ - forceoff
+
+ /sys/devices/system/cpu/smt/active:
+
+ This file reports whether SMT is enabled and active, i.e. if on any
+ physical core two or more sibling threads are online.
+
+ SMT control is also possible at boot time via the l1tf kernel command
+ line parameter in combination with L1D flush control. See
+ :ref:`mitigation_control_command_line`.
+
+5. Disabling EPT
+^^^^^^^^^^^^^^^^
+
+ Disabling EPT for virtual machines provides full mitigation for L1TF even
+ with SMT enabled, because the effective page tables for guests are
+ managed and sanitized by the hypervisor. Though disabling EPT has a
+ significant performance impact especially when the Meltdown mitigation
+ KPTI is enabled.
+
+ EPT can be disabled in the hypervisor via the 'kvm-intel.ept' parameter.
+
+There is ongoing research and development for new mitigation mechanisms to
+address the performance impact of disabling SMT or EPT.
+
+.. _mitigation_control_command_line:
+
+Mitigation control on the kernel command line
+---------------------------------------------
+
+The kernel command line allows to control the L1TF mitigations at boot
+time with the option "l1tf=". The valid arguments for this option are:
+
+ ============ =============================================================
+ full Provides all available mitigations for the L1TF
+ vulnerability. Disables SMT and enables all mitigations in
+ the hypervisors, i.e. unconditional L1D flushing
+
+ SMT control and L1D flush control via the sysfs interface
+ is still possible after boot. Hypervisors will issue a
+ warning when the first VM is started in a potentially
+ insecure configuration, i.e. SMT enabled or L1D flush
+ disabled.
+
+ full,force Same as 'full', but disables SMT and L1D flush runtime
+ control. Implies the 'nosmt=force' command line option.
+ (i.e. sysfs control of SMT is disabled.)
+
+ flush Leaves SMT enabled and enables the default hypervisor
+ mitigation, i.e. conditional L1D flushing
+
+ SMT control and L1D flush control via the sysfs interface
+ is still possible after boot. Hypervisors will issue a
+ warning when the first VM is started in a potentially
+ insecure configuration, i.e. SMT enabled or L1D flush
+ disabled.
+
+ flush,nosmt Disables SMT and enables the default hypervisor mitigation,
+ i.e. conditional L1D flushing.
+
+ SMT control and L1D flush control via the sysfs interface
+ is still possible after boot. Hypervisors will issue a
+ warning when the first VM is started in a potentially
+ insecure configuration, i.e. SMT enabled or L1D flush
+ disabled.
+
+ flush,nowarn Same as 'flush', but hypervisors will not warn when a VM is
+ started in a potentially insecure configuration.
+
+ off Disables hypervisor mitigations and doesn't emit any
+ warnings.
+ ============ =============================================================
+
+The default is 'flush'. For details about L1D flushing see :ref:`l1d_flush`.
+
+
+.. _mitigation_control_kvm:
+
+Mitigation control for KVM - module parameter
+-------------------------------------------------------------
+
+The KVM hypervisor mitigation mechanism, flushing the L1D cache when
+entering a guest, can be controlled with a module parameter.
+
+The option/parameter is "kvm-intel.vmentry_l1d_flush=". It takes the
+following arguments:
+
+ ============ ==============================================================
+ always L1D cache flush on every VMENTER.
+
+ cond Flush L1D on VMENTER only when the code between VMEXIT and
+ VMENTER can leak host memory which is considered
+ interesting for an attacker. This still can leak host memory
+ which allows e.g. to determine the hosts address space layout.
+
+ never Disables the mitigation
+ ============ ==============================================================
+
+The parameter can be provided on the kernel command line, as a module
+parameter when loading the modules and at runtime modified via the sysfs
+file:
+
+/sys/module/kvm_intel/parameters/vmentry_l1d_flush
+
+The default is 'cond'. If 'l1tf=full,force' is given on the kernel command
+line, then 'always' is enforced and the kvm-intel.vmentry_l1d_flush
+module parameter is ignored and writes to the sysfs file are rejected.
+
+
+Mitigation selection guide
+--------------------------
+
+1. No virtualization in use
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+ The system is protected by the kernel unconditionally and no further
+ action is required.
+
+2. Virtualization with trusted guests
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+ If the guest comes from a trusted source and the guest OS kernel is
+ guaranteed to have the L1TF mitigations in place the system is fully
+ protected against L1TF and no further action is required.
+
+ To avoid the overhead of the default L1D flushing on VMENTER the
+ administrator can disable the flushing via the kernel command line and
+ sysfs control files. See :ref:`mitigation_control_command_line` and
+ :ref:`mitigation_control_kvm`.
+
+
+3. Virtualization with untrusted guests
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+3.1. SMT not supported or disabled
+""""""""""""""""""""""""""""""""""
+
+ If SMT is not supported by the processor or disabled in the BIOS or by
+ the kernel, it's only required to enforce L1D flushing on VMENTER.
+
+ Conditional L1D flushing is the default behaviour and can be tuned. See
+ :ref:`mitigation_control_command_line` and :ref:`mitigation_control_kvm`.
+
+3.2. EPT not supported or disabled
+""""""""""""""""""""""""""""""""""
+
+ If EPT is not supported by the processor or disabled in the hypervisor,
+ the system is fully protected. SMT can stay enabled and L1D flushing on
+ VMENTER is not required.
+
+ EPT can be disabled in the hypervisor via the 'kvm-intel.ept' parameter.
+
+3.3. SMT and EPT supported and active
+"""""""""""""""""""""""""""""""""""""
+
+ If SMT and EPT are supported and active then various degrees of
+ mitigations can be employed:
+
+ - L1D flushing on VMENTER:
+
+ L1D flushing on VMENTER is the minimal protection requirement, but it
+ is only potent in combination with other mitigation methods.
+
+ Conditional L1D flushing is the default behaviour and can be tuned. See
+ :ref:`mitigation_control_command_line` and :ref:`mitigation_control_kvm`.
+
+ - Guest confinement:
+
+ Confinement of guests to a single or a group of physical cores which
+ are not running any other processes, can reduce the attack surface
+ significantly, but interrupts, soft interrupts and kernel threads can
+ still expose valuable data to a potential attacker. See
+ :ref:`guest_confinement`.
+
+ - Interrupt isolation:
+
+ Isolating the guest CPUs from interrupts can reduce the attack surface
+ further, but still allows a malicious guest to explore a limited amount
+ of host physical memory. This can at least be used to gain knowledge
+ about the host address space layout. The interrupts which have a fixed
+ affinity to the CPUs which run the untrusted guests can depending on
+ the scenario still trigger soft interrupts and schedule kernel threads
+ which might expose valuable information. See
+ :ref:`interrupt_isolation`.
+
+The above three mitigation methods combined can provide protection to a
+certain degree, but the risk of the remaining attack surface has to be
+carefully analyzed. For full protection the following methods are
+available:
+
+ - Disabling SMT:
+
+ Disabling SMT and enforcing the L1D flushing provides the maximum
+ amount of protection. This mitigation is not depending on any of the
+ above mitigation methods.
+
+ SMT control and L1D flushing can be tuned by the command line
+ parameters 'nosmt', 'l1tf', 'kvm-intel.vmentry_l1d_flush' and at run
+ time with the matching sysfs control files. See :ref:`smt_control`,
+ :ref:`mitigation_control_command_line` and
+ :ref:`mitigation_control_kvm`.
+
+ - Disabling EPT:
+
+ Disabling EPT provides the maximum amount of protection as well. It is
+ not depending on any of the above mitigation methods. SMT can stay
+ enabled and L1D flushing is not required, but the performance impact is
+ significant.
+
+ EPT can be disabled in the hypervisor via the 'kvm-intel.ept'
+ parameter.
+
+3.4. Nested virtual machines
+""""""""""""""""""""""""""""
+
+When nested virtualization is in use, three operating systems are involved:
+the bare metal hypervisor, the nested hypervisor and the nested virtual
+machine. VMENTER operations from the nested hypervisor into the nested
+guest will always be processed by the bare metal hypervisor. If KVM is the
+bare metal hypervisor it wiil:
+
+ - Flush the L1D cache on every switch from the nested hypervisor to the
+ nested virtual machine, so that the nested hypervisor's secrets are not
+ exposed to the nested virtual machine;
+
+ - Flush the L1D cache on every switch from the nested virtual machine to
+ the nested hypervisor; this is a complex operation, and flushing the L1D
+ cache avoids that the bare metal hypervisor's secrets are exposed to the
+ nested virtual machine;
+
+ - Instruct the nested hypervisor to not perform any L1D cache flush. This
+ is an optimization to avoid double L1D flushing.
+
+
+.. _default_mitigations:
+
+Default mitigations
+-------------------
+
+ The kernel default mitigations for vulnerable processors are:
+
+ - PTE inversion to protect against malicious user space. This is done
+ unconditionally and cannot be controlled.
+
+ - L1D conditional flushing on VMENTER when EPT is enabled for
+ a guest.
+
+ The kernel does not by default enforce the disabling of SMT, which leaves
+ SMT systems vulnerable when running untrusted guests with EPT enabled.
+
+ The rationale for this choice is:
+
+ - Force disabling SMT can break existing setups, especially with
+ unattended updates.
+
+ - If regular users run untrusted guests on their machine, then L1TF is
+ just an add on to other malware which might be embedded in an untrusted
+ guest, e.g. spam-bots or attacks on the local network.
+
+ There is no technical way to prevent a user from running untrusted code
+ on their machines blindly.
+
+ - It's technically extremely unlikely and from today's knowledge even
+ impossible that L1TF can be exploited via the most popular attack
+ mechanisms like JavaScript because these mechanisms have no way to
+ control PTEs. If this would be possible and not other mitigation would
+ be possible, then the default might be different.
+
+ - The administrators of cloud and hosting setups have to carefully
+ analyze the risk for their scenarios and make the appropriate
+ mitigation choices, which might even vary across their deployed
+ machines and also result in other changes of their overall setup.
+ There is no way for the kernel to provide a sensible default for this
+ kind of scenarios.
diff --git a/Documentation/process/changes.rst b/Documentation/process/changes.rst
index ddc029734b25..005d8842a503 100644
--- a/Documentation/process/changes.rst
+++ b/Documentation/process/changes.rst
@@ -35,7 +35,7 @@ binutils 2.20 ld -v
flex 2.5.35 flex --version
bison 2.0 bison --version
util-linux 2.10o fdformat --version
-module-init-tools 0.9.10 depmod -V
+kmod 13 depmod -V
e2fsprogs 1.41.4 e2fsck -V
jfsutils 1.1.3 fsck.jfs -V
reiserfsprogs 3.6.3 reiserfsck -V
@@ -156,12 +156,6 @@ is not build with ``CONFIG_KALLSYMS`` and you have no way to rebuild and
reproduce the Oops with that option, then you can still decode that Oops
with ksymoops.
-Module-Init-Tools
------------------
-
-A new module loader is now in the kernel that requires ``module-init-tools``
-to use. It is backward compatible with the 2.4.x series kernels.
-
Mkinitrd
--------
@@ -371,16 +365,17 @@ Util-linux
- <https://www.kernel.org/pub/linux/utils/util-linux/>
+Kmod
+----
+
+- <https://www.kernel.org/pub/linux/utils/kernel/kmod/>
+- <https://git.kernel.org/pub/scm/utils/kernel/kmod/kmod.git>
+
Ksymoops
--------
- <https://www.kernel.org/pub/linux/utils/kernel/ksymoops/v2.4/>
-Module-Init-Tools
------------------
-
-- <https://www.kernel.org/pub/linux/utils/kernel/module-init-tools/>
-
Mkinitrd
--------
diff --git a/Makefile b/Makefile
index ce4248f558d1..429a1fe0b40b 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
VERSION = 4
PATCHLEVEL = 17
-SUBLEVEL = 14
+SUBLEVEL = 18
EXTRAVERSION =
NAME = Merciless Moray
diff --git a/arch/Kconfig b/arch/Kconfig
index 75dd23acf133..95ee27f372ed 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -13,6 +13,9 @@ config KEXEC_CORE
config HAVE_IMA_KEXEC
bool
+config HOTPLUG_SMT
+ bool
+
config OPROFILE
tristate "OProfile system profiling"
depends on PROFILING
diff --git a/arch/arm/boot/dts/imx6sx.dtsi b/arch/arm/boot/dts/imx6sx.dtsi
index 49c7205b8db8..77fdad65e2bb 100644
--- a/arch/arm/boot/dts/imx6sx.dtsi
+++ b/arch/arm/boot/dts/imx6sx.dtsi
@@ -1351,7 +1351,7 @@
ranges = <0x81000000 0 0 0x08f80000 0 0x00010000 /* downstream I/O */
0x82000000 0 0x08000000 0x08000000 0 0x00f00000>; /* non-prefetchable memory */
num-lanes = <1>;
- interrupts = <GIC_SPI 123 IRQ_TYPE_LEVEL_HIGH>;
+ interrupts = <GIC_SPI 120 IRQ_TYPE_LEVEL_HIGH>;
interrupt-names = "msi";
#interrupt-cells = <1>;
interrupt-map-mask = <0 0 0 0x7>;
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 493ff75670ff..8ae5d7ae4af3 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -977,12 +977,12 @@ int pmd_clear_huge(pmd_t *pmdp)
return 1;
}
-int pud_free_pmd_page(pud_t *pud)
+int pud_free_pmd_page(pud_t *pud, unsigned long addr)
{
return pud_none(*pud);
}
-int pmd_free_pte_page(pmd_t *pmd)
+int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
{
return pmd_none(*pmd);
}
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig
index fc5a574c3482..f02087656528 100644
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -199,7 +199,7 @@ config PREFETCH
config MLONGCALLS
bool "Enable the -mlong-calls compiler option for big kernels"
- def_bool y if (!MODULES)
+ default y
depends on PA8X00
help
If you configure the kernel to include many drivers built-in instead
diff --git a/arch/parisc/include/asm/barrier.h b/arch/parisc/include/asm/barrier.h
new file mode 100644
index 000000000000..dbaaca84f27f
--- /dev/null
+++ b/arch/parisc/include/asm/barrier.h
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __ASM_BARRIER_H
+#define __ASM_BARRIER_H
+
+#ifndef __ASSEMBLY__
+
+/* The synchronize caches instruction executes as a nop on systems in
+ which all memory references are performed in order. */
+#define synchronize_caches() __asm__ __volatile__ ("sync" : : : "memory")
+
+#if defined(CONFIG_SMP)
+#define mb() do { synchronize_caches(); } while (0)
+#define rmb() mb()
+#define wmb() mb()
+#define dma_rmb() mb()
+#define dma_wmb() mb()
+#else
+#define mb() barrier()
+#define rmb() barrier()
+#define wmb() barrier()
+#define dma_rmb() barrier()
+#define dma_wmb() barrier()
+#endif
+
+#define __smp_mb() mb()
+#define __smp_rmb() mb()
+#define __smp_wmb() mb()
+
+#include <asm-generic/barrier.h>
+
+#endif /* !__ASSEMBLY__ */
+#endif /* __ASM_BARRIER_H */
diff --git a/arch/parisc/kernel/entry.S b/arch/parisc/kernel/entry.S
index e95207c0565e..1b4732e20137 100644
--- a/arch/parisc/kernel/entry.S
+++ b/arch/parisc/kernel/entry.S
@@ -482,6 +482,8 @@
.macro tlb_unlock0 spc,tmp
#ifdef CONFIG_SMP
or,COND(=) %r0,\spc,%r0
+ sync
+ or,COND(=) %r0,\spc,%r0
stw \spc,0(\tmp)
#endif
.endm
diff --git a/arch/parisc/kernel/pacache.S b/arch/parisc/kernel/pacache.S
index 22e6374ece44..97451e67d35b 100644
--- a/arch/parisc/kernel/pacache.S
+++ b/arch/parisc/kernel/pacache.S
@@ -353,6 +353,7 @@ ENDPROC_CFI(flush_data_cache_local)
.macro tlb_unlock la,flags,tmp
#ifdef CONFIG_SMP
ldi 1,\tmp
+ sync
stw \tmp,0(\la)
mtsm \flags
#endif
diff --git a/arch/parisc/kernel/syscall.S b/arch/parisc/kernel/syscall.S
index e775f80ae28c..4886a6db42e9 100644
--- a/arch/parisc/kernel/syscall.S
+++ b/arch/parisc/kernel/syscall.S
@@ -633,6 +633,7 @@ cas_action:
sub,<> %r28, %r25, %r0
2: stw,ma %r24, 0(%r26)
/* Free lock */
+ sync
stw,ma %r20, 0(%sr2,%r20)
#if ENABLE_LWS_DEBUG
/* Clear thread register indicator */
@@ -647,6 +648,7 @@ cas_action:
3:
/* Error occurred on load or store */
/* Free lock */
+ sync
stw %r20, 0(%sr2,%r20)
#if ENABLE_LWS_DEBUG
stw %r0, 4(%sr2,%r20)
@@ -848,6 +850,7 @@ cas2_action:
cas2_end:
/* Free lock */
+ sync
stw,ma %r20, 0(%sr2,%r20)
/* Enable interrupts */
ssm PSW_SM_I, %r0
@@ -858,6 +861,7 @@ cas2_end:
22:
/* Error occurred on load or store */
/* Free lock */
+ sync
stw %r20, 0(%sr2,%r20)
ssm PSW_SM_I, %r0
ldo 1(%r0),%r28
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index c07f492b871a..960539ae701c 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -183,6 +183,7 @@ config X86
select HAVE_SYSCALL_TRACEPOINTS
select HAVE_UNSTABLE_SCHED_CLOCK
select HAVE_USER_RETURN_NOTIFIER
+ select HOTPLUG_SMT if SMP
select IRQ_FORCED_THREADING
select PCI_LOCKLESS_CONFIG
select PERF_EVENTS
diff --git a/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S
index 16c4ccb1f154..d2364c55bbde 100644
--- a/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S
+++ b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S
@@ -265,7 +265,7 @@ ENTRY(sha256_mb_mgr_get_comp_job_avx2)
vpinsrd $1, _args_digest+1*32(state, idx, 4), %xmm0, %xmm0
vpinsrd $2, _args_digest+2*32(state, idx, 4), %xmm0, %xmm0
vpinsrd $3, _args_digest+3*32(state, idx, 4), %xmm0, %xmm0
- vmovd _args_digest(state , idx, 4) , %xmm0
+ vmovd _args_digest+4*32(state, idx, 4), %xmm1
vpinsrd $1, _args_digest+5*32(state, idx, 4), %xmm1, %xmm1
vpinsrd $2, _args_digest+6*32(state, idx, 4), %xmm1, %xmm1
vpinsrd $3, _args_digest+7*32(state, idx, 4), %xmm1, %xmm1
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 74a9e06b6cfd..130e81e10fc7 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -10,6 +10,7 @@
#include <asm/fixmap.h>
#include <asm/mpspec.h>
#include <asm/msr.h>
+#include <asm/hardirq.h>
#define ARCH_APICTIMER_STOPS_ON_C3 1
@@ -502,12 +503,19 @@ extern int default_check_phys_apicid_present(int phys_apicid);
#endif /* CONFIG_X86_LOCAL_APIC */
+#ifdef CONFIG_SMP
+bool apic_id_is_primary_thread(unsigned int id);
+#else
+static inline bool apic_id_is_primary_thread(unsigned int id) { return false; }
+#endif
+
extern void irq_enter(void);
extern void irq_exit(void);
static inline void entering_irq(void)
{
irq_enter();
+ kvm_set_cpu_l1tf_flush_l1d();
}
static inline void entering_ack_irq(void)
@@ -520,6 +528,7 @@ static inline void ipi_entering_ack_irq(void)
{
irq_enter();
ack_APIC_irq();
+ kvm_set_cpu_l1tf_flush_l1d();
}
static inline void exiting_irq(void)
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index fb00a2fca990..f8659f070fc6 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -219,6 +219,7 @@
#define X86_FEATURE_IBPB ( 7*32+26) /* Indirect Branch Prediction Barrier */
#define X86_FEATURE_STIBP ( 7*32+27) /* Single Thread Indirect Branch Predictors */
#define X86_FEATURE_ZEN ( 7*32+28) /* "" CPU is AMD family 0x17 (Zen) */
+#define X86_FEATURE_L1TF_PTEINV ( 7*32+29) /* "" L1TF workaround PTE inversion */
/* Virtualization flags: Linux defined, word 8 */
#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */
@@ -339,6 +340,7 @@
#define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */
#define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */
#define X86_FEATURE_INTEL_STIBP (18*32+27) /* "" Single Thread Indirect Branch Predictors */
+#define X86_FEATURE_FLUSH_L1D (18*32+28) /* Flush L1D cache */
#define X86_FEATURE_ARCH_CAPABILITIES (18*32+29) /* IA32_ARCH_CAPABILITIES MSR (Intel) */
#define X86_FEATURE_SPEC_CTRL_SSBD (18*32+31) /* "" Speculative Store Bypass Disable */
@@ -371,5 +373,6 @@
#define X86_BUG_SPECTRE_V1 X86_BUG(15) /* CPU is affected by Spectre variant 1 attack with conditional branches */
#define X86_BUG_SPECTRE_V2 X86_BUG(16) /* CPU is affected by Spectre variant 2 attack with indirect branches */
#define X86_BUG_SPEC_STORE_BYPASS X86_BUG(17) /* CPU is affected by speculative store bypass attack */
+#define X86_BUG_L1TF X86_BUG(18) /* CPU is affected by L1 Terminal Fault */
#endif /* _ASM_X86_CPUFEATURES_H */
diff --git a/arch/x86/include/asm/dmi.h b/arch/x86/include/asm/dmi.h
index 0ab2ab27ad1f..b825cb201251 100644
--- a/arch/x86/include/asm/dmi.h
+++ b/arch/x86/include/asm/dmi.h
@@ -4,8 +4,8 @@
#include <linux/compiler.h>
#include <linux/init.h>
+#include <linux/io.h>
-#include <asm/io.h>
#include <asm/setup.h>
static __always_inline __init void *dmi_alloc(unsigned len)
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 5ea2afd4c871..0459169ab589 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -3,10 +3,12 @@
#define _ASM_X86_HARDIRQ_H
#include <linux/threads.h>
-#include <linux/irq.h>
typedef struct {
- unsigned int __softirq_pending;
+ u16 __softirq_pending;
+#if IS_ENABLED(CONFIG_KVM_INTEL)
+ u8 kvm_cpu_l1tf_flush_l1d;
+#endif
unsigned int __nmi_count; /* arch dependent */
#ifdef CONFIG_X86_LOCAL_APIC
unsigned int apic_timer_irqs; /* arch dependent */
@@ -66,4 +68,24 @@ extern u64 arch_irq_stat_cpu(unsigned int cpu);
extern u64 arch_irq_stat(void);
#define arch_irq_stat arch_irq_stat
+
+#if IS_ENABLED(CONFIG_KVM_INTEL)
+static inline void kvm_set_cpu_l1tf_flush_l1d(void)
+{
+ __this_cpu_write(irq_stat.kvm_cpu_l1tf_flush_l1d, 1);
+}
+
+static inline void kvm_clear_cpu_l1tf_flush_l1d(void)
+{
+ __this_cpu_write(irq_stat.kvm_cpu_l1tf_flush_l1d, 0);
+}
+
+static inline bool kvm_get_cpu_l1tf_flush_l1d(void)
+{
+ return __this_cpu_read(irq_stat.kvm_cpu_l1tf_flush_l1d);
+}
+#else /* !IS_ENABLED(CONFIG_KVM_INTEL) */
+static inline void kvm_set_cpu_l1tf_flush_l1d(void) { }
+#endif /* IS_ENABLED(CONFIG_KVM_INTEL) */
+
#endif /* _ASM_X86_HARDIRQ_H */
diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h
index 5cdcdbd4d892..89789e8c80f6 100644
--- a/arch/x86/include/asm/i8259.h
+++ b/arch/x86/include/asm/i8259.h
@@ -3,6 +3,7 @@
#define _ASM_X86_I8259_H
#include <linux/delay.h>
+#include <asm/io.h>
extern unsigned int cached_irq_mask;
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index c4fc17220df9..c14f2a74b2be 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -13,6 +13,8 @@
* Interrupt control:
*/
+/* Declaration required for gcc < 4.9 to prevent -Werror=missing-prototypes */
+extern inline unsigned long native_save_fl(void);
extern inline unsigned long native_save_fl(void)
{
unsigned long flags;
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f4b2588865e9..5d216d1f40a2 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -17,6 +17,7 @@
#include <linux/tracepoint.h>
#include <linux/cpumask.h>
#include <linux/irq_work.h>
+#include <linux/irq.h>
#include <linux/kvm.h>
#include <linux/kvm_para.h>
@@ -711,6 +712,9 @@ struct kvm_vcpu_arch {
/* be preempted when it's in kernel-mode(cpl=0) */
bool preempted_in_kernel;
+
+ /* Flush the L1 Data cache for L1TF mitigation on VMENTER */
+ bool l1tf_flush_l1d;
};
struct kvm_lpage_info {
@@ -879,6 +883,7 @@ struct kvm_vcpu_stat {
u64 signal_exits;
u64 irq_window_exits;
u64 nmi_window_exits;
+ u64 l1d_flush;
u64 halt_exits;
u64 halt_successful_poll;
u64 halt_attempted_poll;
@@ -1410,6 +1415,7 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu);
+u64 kvm_get_arch_capabilities(void);
void kvm_define_shared_msr(unsigned index, u32 msr);
int kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index fda2114197b3..a7df14793e1d 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -70,12 +70,19 @@
#define MSR_IA32_ARCH_CAPABILITIES 0x0000010a
#define ARCH_CAP_RDCL_NO (1 << 0) /* Not susceptible to Meltdown */
#define ARCH_CAP_IBRS_ALL (1 << 1) /* Enhanced IBRS support */
+#define ARCH_CAP_SKIP_VMENTRY_L1DFLUSH (1 << 3) /* Skip L1D flush on vmentry */
#define ARCH_CAP_SSB_NO (1 << 4) /*
* Not susceptible to Speculative Store Bypass
* attack, so no Speculative Store Bypass
* control required.
*/
+#define MSR_IA32_FLUSH_CMD 0x0000010b
+#define L1D_FLUSH (1 << 0) /*
+ * Writeback and invalidate the
+ * L1 data cache.
+ */
+
#define MSR_IA32_BBL_CR_CTL 0x00000119
#define MSR_IA32_BBL_CR_CTL3 0x0000011e
diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h
index aa30c3241ea7..0d5c739eebd7 100644
--- a/arch/x86/include/asm/page_32_types.h
+++ b/arch/x86/include/asm/page_32_types.h
@@ -29,8 +29,13 @@
#define N_EXCEPTION_STACKS 1
#ifdef CONFIG_X86_PAE
-/* 44=32+12, the limit we can fit into an unsigned long pfn */
-#define __PHYSICAL_MASK_SHIFT 44
+/*
+ * This is beyond the 44 bit limit imposed by the 32bit long pfns,
+ * but we need the full mask to make sure inverted PROT_NONE
+ * entries have all the host bits set in a guest.
+ * The real limit is still 44 bits.
+ */
+#define __PHYSICAL_MASK_SHIFT 52
#define __VIRTUAL_MASK_SHIFT 32
#else /* !CONFIG_X86_PAE */
diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h
index 685ffe8a0eaf..60d0f9015317 100644
--- a/arch/x86/include/asm/pgtable-2level.h
+++ b/arch/x86/include/asm/pgtable-2level.h
@@ -95,4 +95,21 @@ static inline unsigned long pte_bitop(unsigned long value, unsigned int rightshi
#define __pte_to_swp_entry(pte) ((swp_entry_t) { (pte).pte_low })
#define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val })
+/* No inverted PFNs on 2 level page tables */
+
+static inline u64 protnone_mask(u64 val)
+{
+ return 0;
+}
+
+static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask)
+{
+ return val;
+}
+
+static inline bool __pte_needs_invert(u64 val)
+{
+ return false;
+}
+
#endif /* _ASM_X86_PGTABLE_2LEVEL_H */
diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h
index f24df59c40b2..bb035a4cbc8c 100644
--- a/arch/x86/include/asm/pgtable-3level.h
+++ b/arch/x86/include/asm/pgtable-3level.h
@@ -241,12 +241,43 @@ static inline pud_t native_pudp_get_and_clear(pud_t *pudp)
#endif
/* Encode and de-code a swap entry */
+#define SWP_TYPE_BITS 5
+
+#define SWP_OFFSET_FIRST_BIT (_PAGE_BIT_PROTNONE + 1)
+
+/* We always extract/encode the offset by shifting it all the way up, and then down again */
+#define SWP_OFFSET_SHIFT (SWP_OFFSET_FIRST_BIT + SWP_TYPE_BITS)
+
#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > 5)
#define __swp_type(x) (((x).val) & 0x1f)
#define __swp_offset(x) ((x).val >> 5)
#define __swp_entry(type, offset) ((swp_entry_t){(type) | (offset) << 5})
-#define __pte_to_swp_entry(pte) ((swp_entry_t){ (pte).pte_high })
-#define __swp_entry_to_pte(x) ((pte_t){ { .pte_high = (x).val } })
+
+/*
+ * Normally, __swp_entry() converts from arch-independent swp_entry_t to
+ * arch-dependent swp_entry_t, and __swp_entry_to_pte() just stores the result
+ * to pte. But here we have 32bit swp_entry_t and 64bit pte, and need to use the
+ * whole 64 bits. Thus, we shift the "real" arch-dependent conversion to
+ * __swp_entry_to_pte() through the following helper macro based on 64bit
+ * __swp_entry().
+ */
+#define __swp_pteval_entry(type, offset) ((pteval_t) { \
+ (~(pteval_t)(offset) << SWP_OFFSET_SHIFT >> SWP_TYPE_BITS) \
+ | ((pteval_t)(type) << (64 - SWP_TYPE_BITS)) })
+
+#define __swp_entry_to_pte(x) ((pte_t){ .pte = \
+ __swp_pteval_entry(__swp_type(x), __swp_offset(x)) })
+/*
+ * Analogically, __pte_to_swp_entry() doesn't just extract the arch-dependent
+ * swp_entry_t, but also has to convert it from 64bit to the 32bit
+ * intermediate representation, using the following macros based on 64bit
+ * __swp_type() and __swp_offset().
+ */
+#define __pteval_swp_type(x) ((unsigned long)((x).pte >> (64 - SWP_TYPE_BITS)))
+#define __pteval_swp_offset(x) ((unsigned long)(~((x).pte) << SWP_TYPE_BITS >> SWP_OFFSET_SHIFT))
+
+#define __pte_to_swp_entry(pte) (__swp_entry(__pteval_swp_type(pte), \
+ __pteval_swp_offset(pte)))
#define gup_get_pte gup_get_pte
/*
@@ -295,4 +326,6 @@ static inline pte_t gup_get_pte(pte_t *ptep)
return pte;
}
+#include <asm/pgtable-invert.h>
+
#endif /* _ASM_X86_PGTABLE_3LEVEL_H */
diff --git a/arch/x86/include/asm/pgtable-invert.h b/arch/x86/include/asm/pgtable-invert.h
new file mode 100644
index 000000000000..a0c1525f1b6f
--- /dev/null
+++ b/arch/x86/include/asm/pgtable-invert.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_PGTABLE_INVERT_H
+#define _ASM_PGTABLE_INVERT_H 1
+
+#ifndef __ASSEMBLY__
+
+/*
+ * A clear pte value is special, and doesn't get inverted.
+ *
+ * Note that even users that only pass a pgprot_t (rather
+ * than a full pte) won't trigger the special zero case,
+ * because even PAGE_NONE has _PAGE_PROTNONE | _PAGE_ACCESSED
+ * set. So the all zero case really is limited to just the
+ * cleared page table entry case.
+ */
+static inline bool __pte_needs_invert(u64 val)
+{
+ return val && !(val & _PAGE_PRESENT);
+}
+
+/* Get a mask to xor with the page table entry to get the correct pfn. */
+static inline u64 protnone_mask(u64 val)
+{
+ return __pte_needs_invert(val) ? ~0ull : 0;
+}
+
+static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask)
+{
+ /*
+ * When a PTE transitions from NONE to !NONE or vice-versa
+ * invert the PFN part to stop speculation.
+ * pte_pfn undoes this when needed.
+ */
+ if (__pte_needs_invert(oldval) != __pte_needs_invert(val))
+ val = (val & ~mask) | (~val & mask);
+ return val;
+}
+
+#endif /* __ASSEMBLY__ */
+
+#endif
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index f1633de5a675..f3bbb6ea5937 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -185,19 +185,29 @@ static inline int pte_special(pte_t pte)
return pte_flags(pte) & _PAGE_SPECIAL;
}
+/* Entries that were set to PROT_NONE are inverted */
+
+static inline u64 protnone_mask(u64 val);
+
static inline unsigned long pte_pfn(pte_t pte)
{
- return (pte_val(pte) & PTE_PFN_MASK) >> PAGE_SHIFT;
+ phys_addr_t pfn = pte_val(pte);
+ pfn ^= protnone_mask(pfn);
+ return (pfn & PTE_PFN_MASK) >> PAGE_SHIFT;
}
static inline unsigned long pmd_pfn(pmd_t pmd)
{
- return (pmd_val(pmd) & pmd_pfn_mask(pmd)) >> PAGE_SHIFT;
+ phys_addr_t pfn = pmd_val(pmd);
+ pfn ^= protnone_mask(pfn);
+ return (pfn & pmd_pfn_mask(pmd)) >> PAGE_SHIFT;
}
static inline unsigned long pud_pfn(pud_t pud)
{
- return (pud_val(pud) & pud_pfn_mask(pud)) >> PAGE_SHIFT;
+ phys_addr_t pfn = pud_val(pud);
+ pfn ^= protnone_mask(pfn);
+ return (pfn & pud_pfn_mask(pud)) >> PAGE_SHIFT;
}
static inline unsigned long p4d_pfn(p4d_t p4d)
@@ -400,11 +410,6 @@ static inline pmd_t pmd_mkwrite(pmd_t pmd)
return pmd_set_flags(pmd, _PAGE_RW);
}
-static inline pmd_t pmd_mknotpresent(pmd_t pmd)
-{
- return pmd_clear_flags(pmd, _PAGE_PRESENT | _PAGE_PROTNONE);
-}
-
static inline pud_t pud_set_flags(pud_t pud, pudval_t set)
{
pudval_t v = native_pud_val(pud);
@@ -459,11 +464,6 @@ static inline pud_t pud_mkwrite(pud_t pud)
return pud_set_flags(pud, _PAGE_RW);
}
-static inline pud_t pud_mknotpresent(pud_t pud)
-{
- return pud_clear_flags(pud, _PAGE_PRESENT | _PAGE_PROTNONE);
-}
-
#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
static inline int pte_soft_dirty(pte_t pte)
{
@@ -545,25 +545,45 @@ static inline pgprotval_t check_pgprot(pgprot_t pgprot)
static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
{
- return __pte(((phys_addr_t)page_nr << PAGE_SHIFT) |
- check_pgprot(pgprot));
+ phys_addr_t pfn = (phys_addr_t)page_nr << PAGE_SHIFT;
+ pfn ^= protnone_mask(pgprot_val(pgprot));
+ pfn &= PTE_PFN_MASK;
+ return __pte(pfn | check_pgprot(pgprot));
}
static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
{
- return __pmd(((phys_addr_t)page_nr << PAGE_SHIFT) |
- check_pgprot(pgprot));
+ phys_addr_t pfn = (phys_addr_t)page_nr << PAGE_SHIFT;
+ pfn ^= protnone_mask(pgprot_val(pgprot));
+ pfn &= PHYSICAL_PMD_PAGE_MASK;
+ return __pmd(pfn | check_pgprot(pgprot));
}
static inline pud_t pfn_pud(unsigned long page_nr, pgprot_t pgprot)
{
- return __pud(((phys_addr_t)page_nr << PAGE_SHIFT) |
- check_pgprot(pgprot));
+ phys_addr_t pfn = (phys_addr_t)page_nr << PAGE_SHIFT;
+ pfn ^= protnone_mask(pgprot_val(pgprot));
+ pfn &= PHYSICAL_PUD_PAGE_MASK;
+ return __pud(pfn | check_pgprot(pgprot));
}
+static inline pmd_t pmd_mknotpresent(pmd_t pmd)
+{
+ return pfn_pmd(pmd_pfn(pmd),
+ __pgprot(pmd_flags(pmd) & ~(_PAGE_PRESENT|_PAGE_PROTNONE)));
+}
+
+static inline pud_t pud_mknotpresent(pud_t pud)
+{
+ return pfn_pud(pud_pfn(pud),
+ __pgprot(pud_flags(pud) & ~(_PAGE_PRESENT|_PAGE_PROTNONE)));
+}
+
+static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask);
+
static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
{
- pteval_t val = pte_val(pte);
+ pteval_t val = pte_val(pte), oldval = val;
/*
* Chop off the NX bit (if present), and add the NX portion of
@@ -571,17 +591,17 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
*/
val &= _PAGE_CHG_MASK;
val |= check_pgprot(newprot) & ~_PAGE_CHG_MASK;
-
+ val = flip_protnone_guard(oldval, val, PTE_PFN_MASK);
return __pte(val);
}
static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
{
- pmdval_t val = pmd_val(pmd);
+ pmdval_t val = pmd_val(pmd), oldval = val;
val &= _HPAGE_CHG_MASK;
val |= check_pgprot(newprot) & ~_HPAGE_CHG_MASK;
-
+ val = flip_protnone_guard(oldval, val, PHYSICAL_PMD_PAGE_MASK);
return __pmd(val);
}
@@ -1320,6 +1340,14 @@ static inline bool pud_access_permitted(pud_t pud, bool write)
return __pte_access_permitted(pud_val(pud), write);
}
+#define __HAVE_ARCH_PFN_MODIFY_ALLOWED 1
+extern bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot);
+
+static inline bool arch_has_pfn_modify_check(void)
+{
+ return boot_cpu_has_bug(X86_BUG_L1TF);
+}
+
#include <asm-generic/pgtable.h>
#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 877bc27718ae..ea99272ab63e 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -273,7 +273,7 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
*
* | ... | 11| 10| 9|8|7|6|5| 4| 3|2| 1|0| <- bit number
* | ... |SW3|SW2|SW1|G|L|D|A|CD|WT|U| W|P| <- bit names
- * | OFFSET (14->63) | TYPE (9-13) |0|0|X|X| X| X|X|SD|0| <- swp entry
+ * | TYPE (59-63) | ~OFFSET (9-58) |0|0|X|X| X| X|X|SD|0| <- swp entry
*
* G (8) is aliased and used as a PROT_NONE indicator for
* !present ptes. We need to start storing swap entries above
@@ -286,20 +286,34 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
*
* Bit 7 in swp entry should be 0 because pmd_present checks not only P,
* but also L and G.
+ *
+ * The offset is inverted by a binary not operation to make the high
+ * physical bits set.
*/
-#define SWP_TYPE_FIRST_BIT (_PAGE_BIT_PROTNONE + 1)
-#define SWP_TYPE_BITS 5
-/* Place the offset above the type: */
-#define SWP_OFFSET_FIRST_BIT (SWP_TYPE_FIRST_BIT + SWP_TYPE_BITS)
+#define SWP_TYPE_BITS 5
+
+#define SWP_OFFSET_FIRST_BIT (_PAGE_BIT_PROTNONE + 1)
+
+/* We always extract/encode the offset by shifting it all the way up, and then down again */
+#define SWP_OFFSET_SHIFT (SWP_OFFSET_FIRST_BIT+SWP_TYPE_BITS)
#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS)
-#define __swp_type(x) (((x).val >> (SWP_TYPE_FIRST_BIT)) \
- & ((1U << SWP_TYPE_BITS) - 1))
-#define __swp_offset(x) ((x).val >> SWP_OFFSET_FIRST_BIT)
-#define __swp_entry(type, offset) ((swp_entry_t) { \
- ((type) << (SWP_TYPE_FIRST_BIT)) \
- | ((offset) << SWP_OFFSET_FIRST_BIT) })
+/* Extract the high bits for type */
+#define __swp_type(x) ((x).val >> (64 - SWP_TYPE_BITS))
+
+/* Shift up (to get rid of type), then down to get value */
+#define __swp_offset(x) (~(x).val << SWP_TYPE_BITS >> SWP_OFFSET_SHIFT)
+
+/*
+ * Shift the offset up "too far" by TYPE bits, then down again
+ * The offset is inverted by a binary not operation to make the high
+ * physical bits set.
+ */
+#define __swp_entry(type, offset) ((swp_entry_t) { \
+ (~(unsigned long)(offset) << SWP_OFFSET_SHIFT >> SWP_TYPE_BITS) \
+ | ((unsigned long)(type) << (64-SWP_TYPE_BITS)) })
+
#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) })
#define __pmd_to_swp_entry(pmd) ((swp_entry_t) { pmd_val((pmd)) })
#define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val })
@@ -343,5 +357,7 @@ static inline bool gup_fast_permitted(unsigned long start, int nr_pages,
return true;
}
+#include <asm/pgtable-invert.h>
+
#endif /* !__ASSEMBLY__ */
#endif /* _ASM_X86_PGTABLE_64_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 21a114914ba4..d7a9dea8563d 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -181,6 +181,11 @@ extern const struct seq_operations cpuinfo_op;
extern void cpu_detect(struct cpuinfo_x86 *c);
+static inline unsigned long l1tf_pfn_limit(void)
+{
+ return BIT(boot_cpu_data.x86_phys_bits - 1 - PAGE_SHIFT) - 1;
+}
+
extern void early_cpu_init(void);
extern void identify_boot_cpu(void);
extern void identify_secondary_cpu(struct cpuinfo_x86 *);
@@ -986,4 +991,16 @@ bool xen_set_default_idle(void);
void stop_this_cpu(void *dummy);
void df_debug(struct pt_regs *regs, long error_code);
void microcode_check(void);
+
+enum l1tf_mitigations {
+ L1TF_MITIGATION_OFF,
+ L1TF_MITIGATION_FLUSH_NOWARN,
+ L1TF_MITIGATION_FLUSH,
+ L1TF_MITIGATION_FLUSH_NOSMT,
+ L1TF_MITIGATION_FULL,
+ L1TF_MITIGATION_FULL_FORCE
+};
+
+extern enum l1tf_mitigations l1tf_mitigation;
+
#endif /* _ASM_X86_PROCESSOR_H */
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index f75bff8f9d82..547c4fe50711 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -171,7 +171,6 @@ static inline int wbinvd_on_all_cpus(void)
wbinvd();
return 0;
}
-#define smp_num_siblings 1
#endif /* CONFIG_SMP */
extern unsigned disabled_cpus;
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index c1d2a9892352..453cf38a1c33 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -123,13 +123,17 @@ static inline int topology_max_smt_threads(void)
}
int topology_update_package_map(unsigned int apicid, unsigned int cpu);
-extern int topology_phys_to_logical_pkg(unsigned int pkg);
+int topology_phys_to_logical_pkg(unsigned int pkg);
+bool topology_is_primary_thread(unsigned int cpu);
+bool topology_smt_supported(void);
#else
#define topology_max_packages() (1)
static inline int
topology_update_package_map(unsigned int apicid, unsigned int cpu) { return 0; }
static inline int topology_phys_to_logical_pkg(unsigned int pkg) { return 0; }
static inline int topology_max_smt_threads(void) { return 1; }
+static inline bool topology_is_primary_thread(unsigned int cpu) { return true; }
+static inline bool topology_smt_supported(void) { return false; }
#endif
static inline void arch_fix_phys_package_id(int num, u32 slot)
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 7cc81c586d71..9b71d0d24db1 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -574,4 +574,15 @@ enum vm_instruction_error_number {
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID = 28,
};
+enum vmx_l1d_flush_state {
+ VMENTER_L1D_FLUSH_AUTO,
+ VMENTER_L1D_FLUSH_NEVER,
+ VMENTER_L1D_FLUSH_COND,
+ VMENTER_L1D_FLUSH_ALWAYS,
+ VMENTER_L1D_FLUSH_EPT_DISABLED,
+ VMENTER_L1D_FLUSH_NOT_REQUIRED,
+};
+
+extern enum vmx_l1d_flush_state l1tf_vmx_mitigation;
+
#endif
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index adbda5847b14..3b3a2d0af78d 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -56,6 +56,7 @@
#include <asm/hypervisor.h>
#include <asm/cpu_device_id.h>
#include <asm/intel-family.h>
+#include <asm/irq_regs.h>
unsigned int num_processors;
@@ -2192,6 +2193,23 @@ static int cpuid_to_apicid[] = {
[0 ... NR_CPUS - 1] = -1,
};
+#ifdef CONFIG_SMP
+/**
+ * apic_id_is_primary_thread - Check whether APIC ID belongs to a primary thread
+ * @id: APIC ID to check
+ */
+bool apic_id_is_primary_thread(unsigned int apicid)
+{
+ u32 mask;
+
+ if (smp_num_siblings == 1)
+ return true;
+ /* Isolate the SMT bit(s) in the APICID and check for 0 */
+ mask = (1U << (fls(smp_num_siblings) - 1)) - 1;
+ return !(apicid & mask);
+}
+#endif
+
/*
* Should use this API to allocate logical CPU IDs to keep nr_logical_cpuids
* and cpuid_to_apicid[] synchronized.
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 3982f79d2377..ff0d14cd9e82 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -33,6 +33,7 @@
#include <linux/mm.h>
#include <linux/interrupt.h>
+#include <linux/irq.h>
#include <linux/init.h>
#include <linux/delay.h>
#include <linux/sched.h>
diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c
index ce503c99f5c4..72a94401f9e0 100644
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -12,6 +12,7 @@
*/
#include <linux/mm.h>
#include <linux/interrupt.h>
+#include <linux/irq.h>
#include <linux/pci.h>
#include <linux/dmar.h>
#include <linux/hpet.h>
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index b708f597eee3..9f38b4140c27 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -11,6 +11,7 @@
* published by the Free Software Foundation.
*/
#include <linux/interrupt.h>
+#include <linux/irq.h>
#include <linux/seq_file.h>
#include <linux/init.h>
#include <linux/compiler.h>
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index d492752f79e1..391f358ebb4c 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -394,10 +394,10 @@ extern int uv_hub_info_version(void)
EXPORT_SYMBOL(uv_hub_info_version);
/* Default UV memory block size is 2GB */
-static unsigned long mem_block_size = (2UL << 30);
+static unsigned long mem_block_size __initdata = (2UL << 30);
/* Kernel parameter to specify UV mem block size */
-static int parse_mem_block_size(char *ptr)
+static int __init parse_mem_block_size(char *ptr)
{
unsigned long size = memparse(ptr, NULL);
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 1b18be3f35a8..02fc277a56f2 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -298,7 +298,6 @@ static int nearby_node(int apicid)
}
#endif
-#ifdef CONFIG_SMP
/*
* Fix up cpu_core_id for pre-F17h systems to be in the
* [0 .. cores_per_node - 1] range. Not really needed but
@@ -315,6 +314,13 @@ static void legacy_fixup_core_id(struct cpuinfo_x86 *c)
c->cpu_core_id %= cus_per_node;
}
+
+static void amd_get_topology_early(struct cpuinfo_x86 *c)
+{
+ if (cpu_has(c, X86_FEATURE_TOPOEXT))
+ smp_num_siblings = ((cpuid_ebx(0x8000001e) >> 8) & 0xff) + 1;
+}
+
/*
* Fixup core topology information for
* (1) AMD multi-node processors
@@ -333,7 +339,6 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
cpuid(0x8000001e, &eax, &ebx, &ecx, &edx);
node_id = ecx & 0xff;
- smp_num_siblings = ((ebx >> 8) & 0xff) + 1;
if (c->x86 == 0x15)
c->cu_id = ebx & 0xff;
@@ -376,7 +381,6 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
legacy_fixup_core_id(c);
}
}
-#endif
/*
* On a AMD dual core setup the lower bits of the APIC id distinguish the cores.
@@ -384,7 +388,6 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
*/
static void amd_detect_cmp(struct cpuinfo_x86 *c)
{
-#ifdef CONFIG_SMP
unsigned bits;
int cpu = smp_processor_id();
@@ -396,16 +399,11 @@ static void amd_detect_cmp(struct cpuinfo_x86 *c)
/* use socket ID also for last level cache */
per_cpu(cpu_llc_id, cpu) = c->phys_proc_id;
amd_get_topology(c);
-#endif
}
u16 amd_get_nb_id(int cpu)
{
- u16 id = 0;
-#ifdef CONFIG_SMP
- id = per_cpu(cpu_llc_id, cpu);
-#endif
- return id;
+ return per_cpu(cpu_llc_id, cpu);
}
EXPORT_SYMBOL_GPL(amd_get_nb_id);
@@ -624,6 +622,7 @@ clear_sev:
static void early_init_amd(struct cpuinfo_x86 *c)
{
+ u64 value;
u32 dummy;
early_init_amd_mc(c);
@@ -694,6 +693,22 @@ static void early_init_amd(struct cpuinfo_x86 *c)
set_cpu_bug(c, X86_BUG_AMD_E400);
early_detect_mem_encrypt(c);
+
+ /* Re-enable TopologyExtensions if switched off by BIOS */
+ if (c->x86 == 0x15 &&
+ (c->x86_model >= 0x10 && c->x86_model <= 0x6f) &&
+ !cpu_has(c, X86_FEATURE_TOPOEXT)) {
+
+ if (msr_set_bit(0xc0011005, 54) > 0) {
+ rdmsrl(0xc0011005, value);
+ if (value & BIT_64(54)) {
+ set_cpu_cap(c, X86_FEATURE_TOPOEXT);
+ pr_info_once(FW_INFO "CPU: Re-enabling disabled Topology Extensions Support.\n");
+ }
+ }
+ }
+
+ amd_get_topology_early(c);
}
static void init_amd_k8(struct cpuinfo_x86 *c)
@@ -785,19 +800,6 @@ static void init_amd_bd(struct cpuinfo_x86 *c)
{
u64 value;
- /* re-enable TopologyExtensions if switched off by BIOS */
- if ((c->x86_model >= 0x10) && (c->x86_model <= 0x6f) &&
- !cpu_has(c, X86_FEATURE_TOPOEXT)) {
-
- if (msr_set_bit(0xc0011005, 54) > 0) {
- rdmsrl(0xc0011005, value);
- if (value & BIT_64(54)) {
- set_cpu_cap(c, X86_FEATURE_TOPOEXT);
- pr_info_once(FW_INFO "CPU: Re-enabling disabled Topology Extensions Support.\n");
- }
- }
- }
-
/*
* The way access filter has a performance penalty on some workloads.
* Disable it on the affected CPUs.
@@ -861,15 +863,8 @@ static void init_amd(struct cpuinfo_x86 *c)
cpu_detect_cache_sizes(c);
- /* Multi core CPU? */
- if (c->extended_cpuid_level >= 0x80000008) {
- amd_detect_cmp(c);
- srat_detect_node(c);
- }
-
-#ifdef CONFIG_X86_32
- detect_ht(c);
-#endif
+ amd_detect_cmp(c);
+ srat_detect_node(c);
init_amd_cacheinfo(c);
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 7416fc206b4a..d07addb99b71 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -22,14 +22,17 @@
#include <asm/processor-flags.h>
#include <asm/fpu/internal.h>
#include <asm/msr.h>
+#include <asm/vmx.h>
#include <asm/paravirt.h>
#include <asm/alternative.h>
#include <asm/pgtable.h>
#include <asm/set_memory.h>
#include <asm/intel-family.h>
+#include <asm/e820/api.h>
static void __init spectre_v2_select_mitigation(void);
static void __init ssb_select_mitigation(void);
+static void __init l1tf_select_mitigation(void);
/*
* Our boot-time value of the SPEC_CTRL MSR. We read it once so that any
@@ -55,6 +58,12 @@ void __init check_bugs(void)
{
identify_boot_cpu();
+ /*
+ * identify_boot_cpu() initialized SMT support information, let the
+ * core code know.
+ */
+ cpu_smt_check_topology_early();
+
if (!IS_ENABLED(CONFIG_SMP)) {
pr_info("CPU: ");
print_cpu_info(&boot_cpu_data);
@@ -81,6 +90,8 @@ void __init check_bugs(void)
*/
ssb_select_mitigation();
+ l1tf_select_mitigation();
+
#ifdef CONFIG_X86_32
/*
* Check whether we are able to run this kernel safely on SMP.
@@ -311,23 +322,6 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
return cmd;
}
-/* Check for Skylake-like CPUs (for RSB handling) */
-static bool __init is_skylake_era(void)
-{
- if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
- boot_cpu_data.x86 == 6) {
- switch (boot_cpu_data.x86_model) {
- case INTEL_FAM6_SKYLAKE_MOBILE:
- case INTEL_FAM6_SKYLAKE_DESKTOP:
- case INTEL_FAM6_SKYLAKE_X:
- case INTEL_FAM6_KABYLAKE_MOBILE:
- case INTEL_FAM6_KABYLAKE_DESKTOP:
- return true;
- }
- }
- return false;
-}
-
static void __init spectre_v2_select_mitigation(void)
{
enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline();
@@ -388,22 +382,15 @@ retpoline_auto:
pr_info("%s\n", spectre_v2_strings[mode]);
/*
- * If neither SMEP nor PTI are available, there is a risk of
- * hitting userspace addresses in the RSB after a context switch
- * from a shallow call stack to a deeper one. To prevent this fill
- * the entire RSB, even when using IBRS.
+ * If spectre v2 protection has been enabled, unconditionally fill
+ * RSB during a context switch; this protects against two independent
+ * issues:
*
- * Skylake era CPUs have a separate issue with *underflow* of the
- * RSB, when they will predict 'ret' targets from the generic BTB.
- * The proper mitigation for this is IBRS. If IBRS is not supported
- * or deactivated in favour of retpolines the RSB fill on context
- * switch is required.
+ * - RSB underflow (and switch to BTB) on Skylake+
+ * - SpectreRSB variant of spectre v2 on X86_BUG_SPECTRE_V2 CPUs
*/
- if ((!boot_cpu_has(X86_FEATURE_PTI) &&
- !boot_cpu_has(X86_FEATURE_SMEP)) || is_skylake_era()) {
- setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW);
- pr_info("Spectre v2 mitigation: Filling RSB on context switch\n");
- }
+ setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW);
+ pr_info("Spectre v2 / SpectreRSB mitigation: Filling RSB on context switch\n");
/* Initialize Indirect Branch Prediction Barrier if supported */
if (boot_cpu_has(X86_FEATURE_IBPB)) {
@@ -654,8 +641,120 @@ void x86_spec_ctrl_setup_ap(void)
x86_amd_ssb_disable();
}
+#undef pr_fmt
+#define pr_fmt(fmt) "L1TF: " fmt
+
+/* Default mitigation for L1TF-affected CPUs */
+enum l1tf_mitigations l1tf_mitigation __ro_after_init = L1TF_MITIGATION_FLUSH;
+#if IS_ENABLED(CONFIG_KVM_INTEL)
+EXPORT_SYMBOL_GPL(l1tf_mitigation);
+#endif
+enum vmx_l1d_flush_state l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
+EXPORT_SYMBOL_GPL(l1tf_vmx_mitigation);
+
+static void __init l1tf_select_mitigation(void)
+{
+ u64 half_pa;
+
+ if (!boot_cpu_has_bug(X86_BUG_L1TF))
+ return;
+
+ switch (l1tf_mitigation) {
+ case L1TF_MITIGATION_OFF:
+ case L1TF_MITIGATION_FLUSH_NOWARN:
+ case L1TF_MITIGATION_FLUSH:
+ break;
+ case L1TF_MITIGATION_FLUSH_NOSMT:
+ case L1TF_MITIGATION_FULL:
+ cpu_smt_disable(false);
+ break;
+ case L1TF_MITIGATION_FULL_FORCE:
+ cpu_smt_disable(true);
+ break;
+ }
+
+#if CONFIG_PGTABLE_LEVELS == 2
+ pr_warn("Kernel not compiled for PAE. No mitigation for L1TF\n");
+ return;
+#endif
+
+ /*
+ * This is extremely unlikely to happen because almost all
+ * systems have far more MAX_PA/2 than RAM can be fit into
+ * DIMM slots.
+ */
+ half_pa = (u64)l1tf_pfn_limit() << PAGE_SHIFT;
+ if (e820__mapped_any(half_pa, ULLONG_MAX - half_pa, E820_TYPE_RAM)) {
+ pr_warn("System has more than MAX_PA/2 memory. L1TF mitigation not effective.\n");
+ return;
+ }
+
+ setup_force_cpu_cap(X86_FEATURE_L1TF_PTEINV);
+}
+
+static int __init l1tf_cmdline(char *str)
+{
+ if (!boot_cpu_has_bug(X86_BUG_L1TF))
+ return 0;
+
+ if (!str)
+ return -EINVAL;
+
+ if (!strcmp(str, "off"))
+ l1tf_mitigation = L1TF_MITIGATION_OFF;
+ else if (!strcmp(str, "flush,nowarn"))
+ l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOWARN;
+ else if (!strcmp(str, "flush"))
+ l1tf_mitigation = L1TF_MITIGATION_FLUSH;
+ else if (!strcmp(str, "flush,nosmt"))
+ l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOSMT;
+ else if (!strcmp(str, "full"))
+ l1tf_mitigation = L1TF_MITIGATION_FULL;
+ else if (!strcmp(str, "full,force"))
+ l1tf_mitigation = L1TF_MITIGATION_FULL_FORCE;
+
+ return 0;
+}
+early_param("l1tf", l1tf_cmdline);
+
+#undef pr_fmt
+
#ifdef CONFIG_SYSFS
+#define L1TF_DEFAULT_MSG "Mitigation: PTE Inversion"
+
+#if IS_ENABLED(CONFIG_KVM_INTEL)
+static const char *l1tf_vmx_states[] = {
+ [VMENTER_L1D_FLUSH_AUTO] = "auto",
+ [VMENTER_L1D_FLUSH_NEVER] = "vulnerable",
+ [VMENTER_L1D_FLUSH_COND] = "conditional cache flushes",
+ [VMENTER_L1D_FLUSH_ALWAYS] = "cache flushes",
+ [VMENTER_L1D_FLUSH_EPT_DISABLED] = "EPT disabled",
+ [VMENTER_L1D_FLUSH_NOT_REQUIRED] = "flush not necessary"
+};
+
+static ssize_t l1tf_show_state(char *buf)
+{
+ if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO)
+ return sprintf(buf, "%s\n", L1TF_DEFAULT_MSG);
+
+ if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_EPT_DISABLED ||
+ (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER &&
+ cpu_smt_control == CPU_SMT_ENABLED))
+ return sprintf(buf, "%s; VMX: %s\n", L1TF_DEFAULT_MSG,
+ l1tf_vmx_states[l1tf_vmx_mitigation]);
+
+ return sprintf(buf, "%s; VMX: %s, SMT %s\n", L1TF_DEFAULT_MSG,
+ l1tf_vmx_states[l1tf_vmx_mitigation],
+ cpu_smt_control == CPU_SMT_ENABLED ? "vulnerable" : "disabled");
+}
+#else
+static ssize_t l1tf_show_state(char *buf)
+{
+ return sprintf(buf, "%s\n", L1TF_DEFAULT_MSG);
+}
+#endif
+
static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr,
char *buf, unsigned int bug)
{
@@ -681,6 +780,10 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr
case X86_BUG_SPEC_STORE_BYPASS:
return sprintf(buf, "%s\n", ssb_strings[ssb_mode]);
+ case X86_BUG_L1TF:
+ if (boot_cpu_has(X86_FEATURE_L1TF_PTEINV))
+ return l1tf_show_state(buf);
+ break;
default:
break;
}
@@ -707,4 +810,9 @@ ssize_t cpu_show_spec_store_bypass(struct device *dev, struct device_attribute *
{
return cpu_show_common(dev, attr, buf, X86_BUG_SPEC_STORE_BYPASS);
}
+
+ssize_t cpu_show_l1tf(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_L1TF);
+}
#endif
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 38276f58d3bf..ae4f5afb9095 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -66,6 +66,13 @@ cpumask_var_t cpu_callin_mask;
/* representing cpus for which sibling maps can be computed */
cpumask_var_t cpu_sibling_setup_mask;
+/* Number of siblings per CPU package */
+int smp_num_siblings = 1;
+EXPORT_SYMBOL(smp_num_siblings);
+
+/* Last level cache ID of each logical CPU */
+DEFINE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id) = BAD_APICID;
+
/* correctly size the local cpu masks */
void __init setup_cpu_local_masks(void)
{
@@ -638,33 +645,36 @@ static void cpu_detect_tlb(struct cpuinfo_x86 *c)
tlb_lld_4m[ENTRIES], tlb_lld_1g[ENTRIES]);
}
-void detect_ht(struct cpuinfo_x86 *c)
+int detect_ht_early(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_SMP
u32 eax, ebx, ecx, edx;
- int index_msb, core_bits;
- static bool printed;
if (!cpu_has(c, X86_FEATURE_HT))
- return;
+ return -1;
if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
- goto out;
+ return -1;
if (cpu_has(c, X86_FEATURE_XTOPOLOGY))
- return;
+ return -1;
cpuid(1, &eax, &ebx, &ecx, &edx);
smp_num_siblings = (ebx & 0xff0000) >> 16;
-
- if (smp_num_siblings == 1) {
+ if (smp_num_siblings == 1)
pr_info_once("CPU0: Hyper-Threading is disabled\n");
- goto out;
- }
+#endif
+ return 0;
+}
- if (smp_num_siblings <= 1)
- goto out;
+void detect_ht(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_SMP
+ int index_msb, core_bits;
+
+ if (detect_ht_early(c) < 0)
+ return;
index_msb = get_count_order(smp_num_siblings);
c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, index_msb);
@@ -677,15 +687,6 @@ void detect_ht(struct cpuinfo_x86 *c)
c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, index_msb) &
((1 << core_bits) - 1);
-
-out:
- if (!printed && (c->x86_max_cores * smp_num_siblings) > 1) {
- pr_info("CPU: Physical Processor ID: %d\n",
- c->phys_proc_id);
- pr_info("CPU: Processor Core ID: %d\n",
- c->cpu_core_id);
- printed = 1;
- }
#endif
}
@@ -882,7 +883,7 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
apply_forced_caps(c);
}
-static void get_cpu_address_sizes(struct cpuinfo_x86 *c)
+void get_cpu_address_sizes(struct cpuinfo_x86 *c)
{
u32 eax, ebx, ecx, edx;
@@ -958,6 +959,21 @@ static const __initconst struct x86_cpu_id cpu_no_spec_store_bypass[] = {
{}
};
+static const __initconst struct x86_cpu_id cpu_no_l1tf[] = {
+ /* in addition to cpu_no_speculation */
+ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT1 },
+ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT2 },
+ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_AIRMONT },
+ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_MERRIFIELD },
+ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_MOOREFIELD },
+ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_GOLDMONT },
+ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_DENVERTON },
+ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_GEMINI_LAKE },
+ { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNL },
+ { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNM },
+ {}
+};
+
static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
{
u64 ia32_cap = 0;
@@ -983,6 +999,11 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
return;
setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN);
+
+ if (x86_match_cpu(cpu_no_l1tf))
+ return;
+
+ setup_force_cpu_bug(X86_BUG_L1TF);
}
/*
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 37672d299e35..d0a17a438dd0 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -46,7 +46,10 @@ extern const struct cpu_dev *const __x86_cpu_dev_start[],
*const __x86_cpu_dev_end[];
extern void get_cpu_cap(struct cpuinfo_x86 *c);
+extern void get_cpu_address_sizes(struct cpuinfo_x86 *c);
extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c);
+extern int detect_extended_topology_early(struct cpuinfo_x86 *c);
+extern int detect_ht_early(struct cpuinfo_x86 *c);
unsigned int aperfmperf_get_khz(int cpu);
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 577e7f7ae273..bfaaa92ef6de 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -301,6 +301,13 @@ static void early_init_intel(struct cpuinfo_x86 *c)
}
check_mpx_erratum(c);
+
+ /*
+ * Get the number of SMT siblings early from the extended topology
+ * leaf, if available. Otherwise try the legacy SMT detection.
+ */
+ if (detect_extended_topology_early(c) < 0)
+ detect_ht_early(c);
}
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index 08286269fd24..b9bc8a1a584e 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -509,12 +509,20 @@ static struct platform_device *microcode_pdev;
static int check_online_cpus(void)
{
- if (num_online_cpus() == num_present_cpus())
- return 0;
+ unsigned int cpu;
- pr_err("Not all CPUs online, aborting microcode update.\n");
+ /*
+ * Make sure all CPUs are online. It's fine for SMT to be disabled if
+ * all the primary threads are still online.
+ */
+ for_each_present_cpu(cpu) {
+ if (topology_is_primary_thread(cpu) && !cpu_online(cpu)) {
+ pr_err("Not all CPUs online, aborting microcode update.\n");
+ return -EINVAL;
+ }
+ }
- return -EINVAL;
+ return 0;
}
static atomic_t late_cpus_in;
diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c
index b099024d339c..19c6e800e816 100644
--- a/arch/x86/kernel/cpu/topology.c
+++ b/arch/x86/kernel/cpu/topology.c
@@ -27,16 +27,13 @@
* exists, use it for populating initial_apicid and cpu topology
* detection.
*/
-void detect_extended_topology(struct cpuinfo_x86 *c)
+int detect_extended_topology_early(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_SMP
- unsigned int eax, ebx, ecx, edx, sub_index;
- unsigned int ht_mask_width, core_plus_mask_width;
- unsigned int core_select_mask, core_level_siblings;
- static bool printed;
+ unsigned int eax, ebx, ecx, edx;
if (c->cpuid_level < 0xb)
- return;
+ return -1;
cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx);
@@ -44,7 +41,7 @@ void detect_extended_topology(struct cpuinfo_x86 *c)
* check if the cpuid leaf 0xb is actually implemented.
*/
if (ebx == 0 || (LEAFB_SUBTYPE(ecx) != SMT_TYPE))
- return;
+ return -1;
set_cpu_cap(c, X86_FEATURE_XTOPOLOGY);
@@ -52,10 +49,30 @@ void detect_extended_topology(struct cpuinfo_x86 *c)
* initial apic id, which also represents 32-bit extended x2apic id.
*/
c->initial_apicid = edx;
+ smp_num_siblings = LEVEL_MAX_SIBLINGS(ebx);
+#endif
+ return 0;
+}
+
+/*
+ * Check for extended topology enumeration cpuid leaf 0xb and if it
+ * exists, use it for populating initial_apicid and cpu topology
+ * detection.
+ */
+void detect_extended_topology(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_SMP
+ unsigned int eax, ebx, ecx, edx, sub_index;
+ unsigned int ht_mask_width, core_plus_mask_width;
+ unsigned int core_select_mask, core_level_siblings;
+
+ if (detect_extended_topology_early(c) < 0)
+ return;
/*
* Populate HT related information from sub-leaf level 0.
*/
+ cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx);
core_level_siblings = smp_num_siblings = LEVEL_MAX_SIBLINGS(ebx);
core_plus_mask_width = ht_mask_width = BITS_SHIFT_NEXT_LEVEL(eax);
@@ -86,15 +103,5 @@ void detect_extended_topology(struct cpuinfo_x86 *c)
c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
c->x86_max_cores = (core_level_siblings / smp_num_siblings);
-
- if (!printed) {
- pr_info("CPU: Physical Processor ID: %d\n",
- c->phys_proc_id);
- if (c->x86_max_cores > 1)
- pr_info("CPU: Processor Core ID: %d\n",
- c->cpu_core_id);
- printed = 1;
- }
- return;
#endif
}
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index f92a6593de1e..2ea85b32421a 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -10,6 +10,7 @@
#include <asm/fpu/signal.h>
#include <asm/fpu/types.h>
#include <asm/traps.h>
+#include <asm/irq_regs.h>
#include <linux/hardirq.h>
#include <linux/pkeys.h>
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 8ce4212e2b8d..afa1a204bc6d 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -1,6 +1,7 @@
#include <linux/clocksource.h>
#include <linux/clockchips.h>
#include <linux/interrupt.h>
+#include <linux/irq.h>
#include <linux/export.h>
#include <linux/delay.h>
#include <linux/errno.h>
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index 86c4439f9d74..519649ddf100 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -5,6 +5,7 @@
#include <linux/sched.h>
#include <linux/ioport.h>
#include <linux/interrupt.h>
+#include <linux/irq.h>
#include <linux/timex.h>
#include <linux/random.h>
#include <linux/init.h>
diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
index 2c3a1b4294eb..7f6cffaa5322 100644
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -8,6 +8,7 @@
#include <asm/traps.h>
#include <asm/proto.h>
#include <asm/desc.h>
+#include <asm/hw_irq.h>
struct idt_data {
unsigned int vector;
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 328d027d829d..59b5f2ea7c2f 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -10,6 +10,7 @@
#include <linux/ftrace.h>
#include <linux/delay.h>
#include <linux/export.h>
+#include <linux/irq.h>
#include <asm/apic.h>
#include <asm/io_apic.h>
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index c1bdbd3d3232..95600a99ae93 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -11,6 +11,7 @@
#include <linux/seq_file.h>
#include <linux/interrupt.h>
+#include <linux/irq.h>
#include <linux/kernel_stat.h>
#include <linux/notifier.h>
#include <linux/cpu.h>
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index d86e344f5b3d..0469cd078db1 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -11,6 +11,7 @@
#include <linux/kernel_stat.h>
#include <linux/interrupt.h>
+#include <linux/irq.h>
#include <linux/seq_file.h>
#include <linux/delay.h>
#include <linux/ftrace.h>
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 772196c1b8c4..a0693b71cfc1 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -5,6 +5,7 @@
#include <linux/sched.h>
#include <linux/ioport.h>
#include <linux/interrupt.h>
+#include <linux/irq.h>
#include <linux/timex.h>
#include <linux/random.h>
#include <linux/kprobes.h>
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 6f4d42377fe5..44e26dc326d5 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -395,8 +395,6 @@ int __copy_instruction(u8 *dest, u8 *src, u8 *real, struct insn *insn)
- (u8 *) real;
if ((s64) (s32) newdisp != newdisp) {
pr_err("Kprobes error: new displacement does not fit into s32 (%llx)\n", newdisp);
- pr_err("\tSrc: %p, Dest: %p, old disp: %x\n",
- src, real, insn->displacement.value);
return 0;
}
disp = (u8 *) dest + insn_offset_displacement(insn);
@@ -640,8 +638,7 @@ static int reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
* Raise a BUG or we'll continue in an endless reentering loop
* and eventually a stack overflow.
*/
- printk(KERN_WARNING "Unrecoverable kprobe detected at %p.\n",
- p->addr);
+ pr_err("Unrecoverable kprobe detected.\n");
dump_kprobe(p);
BUG();
default:
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 99dc79e76bdc..930c88341e4e 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -88,10 +88,12 @@ unsigned paravirt_patch_call(void *insnbuf,
struct branch *b = insnbuf;
unsigned long delta = (unsigned long)target - (addr+5);
- if (tgt_clobbers & ~site_clobbers)
- return len; /* target would clobber too much for this site */
- if (len < 5)
+ if (len < 5) {
+#ifdef CONFIG_RETPOLINE
+ WARN_ONCE("Failing to patch indirect CALL in %ps\n", (void *)addr);
+#endif
return len; /* call too long for patch site */
+ }
b->opcode = 0xe8; /* call */
b->delta = delta;
@@ -106,8 +108,12 @@ unsigned paravirt_patch_jmp(void *insnbuf, const void *target,
struct branch *b = insnbuf;
unsigned long delta = (unsigned long)target - (addr+5);
- if (len < 5)
+ if (len < 5) {
+#ifdef CONFIG_RETPOLINE
+ WARN_ONCE("Failing to patch indirect JMP in %ps\n", (void *)addr);
+#endif
return len; /* call too long for patch site */
+ }
b->opcode = 0xe9; /* jmp */
b->delta = delta;
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 5c623dfe39d1..89fd35349412 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -823,6 +823,12 @@ void __init setup_arch(char **cmdline_p)
memblock_reserve(__pa_symbol(_text),
(unsigned long)__bss_stop - (unsigned long)_text);
+ /*
+ * Make sure page 0 is always reserved because on systems with
+ * L1TF its contents can be leaked to user processes.
+ */
+ memblock_reserve(0, PAGE_SIZE);
+
early_reserve_initrd();
/*
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 5c574dff4c1a..04adc8d60aed 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -261,6 +261,7 @@ __visible void __irq_entry smp_reschedule_interrupt(struct pt_regs *regs)
{
ack_APIC_irq();
inc_irq_stat(irq_resched_count);
+ kvm_set_cpu_l1tf_flush_l1d();
if (trace_resched_ipi_enabled()) {
/*
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 9dd324ae4832..f5d30c68fd09 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -80,13 +80,7 @@
#include <asm/intel-family.h>
#include <asm/cpu_device_id.h>
#include <asm/spec-ctrl.h>
-
-/* Number of siblings per CPU package */
-int smp_num_siblings = 1;
-EXPORT_SYMBOL(smp_num_siblings);
-
-/* Last level cache ID of each logical CPU */
-DEFINE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id) = BAD_APICID;
+#include <asm/hw_irq.h>
/* representing HT siblings of each logical CPU */
DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map);
@@ -273,6 +267,23 @@ static void notrace start_secondary(void *unused)
}
/**
+ * topology_is_primary_thread - Check whether CPU is the primary SMT thread
+ * @cpu: CPU to check
+ */
+bool topology_is_primary_thread(unsigned int cpu)
+{
+ return apic_id_is_primary_thread(per_cpu(x86_cpu_to_apicid, cpu));
+}
+
+/**
+ * topology_smt_supported - Check whether SMT is supported by the CPUs
+ */
+bool topology_smt_supported(void)
+{
+ return smp_num_siblings > 1;
+}
+
+/**
* topology_phys_to_logical_pkg - Map a physical package id to a logical
*
* Returns logical package id or -1 if not found
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index 774ebafa97c4..be01328eb755 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -12,6 +12,7 @@
#include <linux/clockchips.h>
#include <linux/interrupt.h>
+#include <linux/irq.h>
#include <linux/i8253.h>
#include <linux/time.h>
#include <linux/export.h>
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 030c6bb240d9..2b974d4e1489 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3836,6 +3836,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
{
int r = 1;
+ vcpu->arch.l1tf_flush_l1d = true;
switch (vcpu->arch.apf.host_apf_reason) {
default:
trace_kvm_page_fault(fault_address, error_code);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 7a28959f1985..12cad70acc3b 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -188,6 +188,150 @@ module_param(ple_window_max, uint, 0444);
extern const ulong vmx_return;
+static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
+static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond);
+static DEFINE_MUTEX(vmx_l1d_flush_mutex);
+
+/* Storage for pre module init parameter parsing */
+static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO;
+
+static const struct {
+ const char *option;
+ enum vmx_l1d_flush_state cmd;
+} vmentry_l1d_param[] = {
+ {"auto", VMENTER_L1D_FLUSH_AUTO},
+ {"never", VMENTER_L1D_FLUSH_NEVER},
+ {"cond", VMENTER_L1D_FLUSH_COND},
+ {"always", VMENTER_L1D_FLUSH_ALWAYS},
+};
+
+#define L1D_CACHE_ORDER 4
+static void *vmx_l1d_flush_pages;
+
+static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
+{
+ struct page *page;
+ unsigned int i;
+
+ if (!enable_ept) {
+ l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED;
+ return 0;
+ }
+
+ if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
+ u64 msr;
+
+ rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
+ if (msr & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
+ l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
+ return 0;
+ }
+ }
+
+ /* If set to auto use the default l1tf mitigation method */
+ if (l1tf == VMENTER_L1D_FLUSH_AUTO) {
+ switch (l1tf_mitigation) {
+ case L1TF_MITIGATION_OFF:
+ l1tf = VMENTER_L1D_FLUSH_NEVER;
+ break;
+ case L1TF_MITIGATION_FLUSH_NOWARN:
+ case L1TF_MITIGATION_FLUSH:
+ case L1TF_MITIGATION_FLUSH_NOSMT:
+ l1tf = VMENTER_L1D_FLUSH_COND;
+ break;
+ case L1TF_MITIGATION_FULL:
+ case L1TF_MITIGATION_FULL_FORCE:
+ l1tf = VMENTER_L1D_FLUSH_ALWAYS;
+ break;
+ }
+ } else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) {
+ l1tf = VMENTER_L1D_FLUSH_ALWAYS;
+ }
+
+ if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages &&
+ !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) {
+ page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
+ if (!page)
+ return -ENOMEM;
+ vmx_l1d_flush_pages = page_address(page);
+
+ /*
+ * Initialize each page with a different pattern in
+ * order to protect against KSM in the nested
+ * virtualization case.
+ */
+ for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) {
+ memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1,
+ PAGE_SIZE);
+ }
+ }
+
+ l1tf_vmx_mitigation = l1tf;
+
+ if (l1tf != VMENTER_L1D_FLUSH_NEVER)
+ static_branch_enable(&vmx_l1d_should_flush);
+ else
+ static_branch_disable(&vmx_l1d_should_flush);
+
+ if (l1tf == VMENTER_L1D_FLUSH_COND)
+ static_branch_enable(&vmx_l1d_flush_cond);
+ else
+ static_branch_disable(&vmx_l1d_flush_cond);
+ return 0;
+}
+
+static int vmentry_l1d_flush_parse(const char *s)
+{
+ unsigned int i;
+
+ if (s) {
+ for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) {
+ if (sysfs_streq(s, vmentry_l1d_param[i].option))
+ return vmentry_l1d_param[i].cmd;
+ }
+ }
+ return -EINVAL;
+}
+
+static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
+{
+ int l1tf, ret;
+
+ if (!boot_cpu_has(X86_BUG_L1TF))
+ return 0;
+
+ l1tf = vmentry_l1d_flush_parse(s);
+ if (l1tf < 0)
+ return l1tf;
+
+ /*
+ * Has vmx_init() run already? If not then this is the pre init
+ * parameter parsing. In that case just store the value and let
+ * vmx_init() do the proper setup after enable_ept has been
+ * established.
+ */
+ if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) {
+ vmentry_l1d_flush_param = l1tf;
+ return 0;
+ }
+
+ mutex_lock(&vmx_l1d_flush_mutex);
+ ret = vmx_setup_l1d_flush(l1tf);
+ mutex_unlock(&vmx_l1d_flush_mutex);
+ return ret;
+}
+
+static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
+{
+ return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option);
+}
+
+static const struct kernel_param_ops vmentry_l1d_flush_ops = {
+ .set = vmentry_l1d_flush_set,
+ .get = vmentry_l1d_flush_get,
+};
+module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);
+
struct kvm_vmx {
struct kvm kvm;
@@ -591,6 +735,11 @@ static inline int pi_test_sn(struct pi_desc *pi_desc)
(unsigned long *)&pi_desc->control);
}
+struct vmx_msrs {
+ unsigned int nr;
+ struct vmx_msr_entry val[NR_AUTOLOAD_MSRS];
+};
+
struct vcpu_vmx {
struct kvm_vcpu vcpu;
unsigned long host_rsp;
@@ -624,9 +773,8 @@ struct vcpu_vmx {
struct loaded_vmcs *loaded_vmcs;
bool __launched; /* temporary, used in vmx_vcpu_run */
struct msr_autoload {
- unsigned nr;
- struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS];
- struct vmx_msr_entry host[NR_AUTOLOAD_MSRS];
+ struct vmx_msrs guest;
+ struct vmx_msrs host;
} msr_autoload;
struct {
int loaded;
@@ -2182,9 +2330,20 @@ static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
vm_exit_controls_clearbit(vmx, exit);
}
+static int find_msr(struct vmx_msrs *m, unsigned int msr)
+{
+ unsigned int i;
+
+ for (i = 0; i < m->nr; ++i) {
+ if (m->val[i].index == msr)
+ return i;
+ }
+ return -ENOENT;
+}
+
static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
{
- unsigned i;
+ int i;
struct msr_autoload *m = &vmx->msr_autoload;
switch (msr) {
@@ -2205,18 +2364,21 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
}
break;
}
+ i = find_msr(&m->guest, msr);
+ if (i < 0)
+ goto skip_guest;
+ --m->guest.nr;
+ m->guest.val[i] = m->guest.val[m->guest.nr];
+ vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
- for (i = 0; i < m->nr; ++i)
- if (m->guest[i].index == msr)
- break;
-
- if (i == m->nr)
+skip_guest:
+ i = find_msr(&m->host, msr);
+ if (i < 0)
return;
- --m->nr;
- m->guest[i] = m->guest[m->nr];
- m->host[i] = m->host[m->nr];
- vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
- vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
+
+ --m->host.nr;
+ m->host.val[i] = m->host.val[m->host.nr];
+ vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
}
static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
@@ -2231,9 +2393,9 @@ static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
}
static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
- u64 guest_val, u64 host_val)
+ u64 guest_val, u64 host_val, bool entry_only)
{
- unsigned i;
+ int i, j = 0;
struct msr_autoload *m = &vmx->msr_autoload;
switch (msr) {
@@ -2268,24 +2430,31 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
}
- for (i = 0; i < m->nr; ++i)
- if (m->guest[i].index == msr)
- break;
+ i = find_msr(&m->guest, msr);
+ if (!entry_only)
+ j = find_msr(&m->host, msr);
- if (i == NR_AUTOLOAD_MSRS) {
+ if (i == NR_AUTOLOAD_MSRS || j == NR_AUTOLOAD_MSRS) {
printk_once(KERN_WARNING "Not enough msr switch entries. "
"Can't add msr %x\n", msr);
return;
- } else if (i == m->nr) {
- ++m->nr;
- vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
- vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
}
+ if (i < 0) {
+ i = m->guest.nr++;
+ vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
+ }
+ m->guest.val[i].index = msr;
+ m->guest.val[i].value = guest_val;
+
+ if (entry_only)
+ return;
- m->guest[i].index = msr;
- m->guest[i].value = guest_val;
- m->host[i].index = msr;
- m->host[i].value = host_val;
+ if (j < 0) {
+ j = m->host.nr++;
+ vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
+ }
+ m->host.val[j].index = msr;
+ m->host.val[j].value = host_val;
}
static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
@@ -2329,7 +2498,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
guest_efer &= ~EFER_LME;
if (guest_efer != host_efer)
add_atomic_switch_msr(vmx, MSR_EFER,
- guest_efer, host_efer);
+ guest_efer, host_efer, false);
return false;
} else {
guest_efer &= ~ignore_bits;
@@ -3775,7 +3944,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
vcpu->arch.ia32_xss = data;
if (vcpu->arch.ia32_xss != host_xss)
add_atomic_switch_msr(vmx, MSR_IA32_XSS,
- vcpu->arch.ia32_xss, host_xss);
+ vcpu->arch.ia32_xss, host_xss, false);
else
clear_atomic_switch_msr(vmx, MSR_IA32_XSS);
break;
@@ -6041,9 +6210,9 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
- vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host));
+ vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
- vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest));
+ vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
@@ -6063,8 +6232,7 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
++vmx->nmsrs;
}
- if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
- rdmsrl(MSR_IA32_ARCH_CAPABILITIES, vmx->arch_capabilities);
+ vmx->arch_capabilities = kvm_get_arch_capabilities();
vm_exit_controls_init(vmx, vmcs_config.vmexit_ctrl);
@@ -9282,6 +9450,79 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
}
}
+/*
+ * Software based L1D cache flush which is used when microcode providing
+ * the cache control MSR is not loaded.
+ *
+ * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but to
+ * flush it is required to read in 64 KiB because the replacement algorithm
+ * is not exactly LRU. This could be sized at runtime via topology
+ * information but as all relevant affected CPUs have 32KiB L1D cache size
+ * there is no point in doing so.
+ */
+#define L1D_CACHE_ORDER 4
+static void *vmx_l1d_flush_pages;
+
+static void vmx_l1d_flush(struct kvm_vcpu *vcpu)
+{
+ int size = PAGE_SIZE << L1D_CACHE_ORDER;
+
+ /*
+ * This code is only executed when the the flush mode is 'cond' or
+ * 'always'
+ */
+ if (static_branch_likely(&vmx_l1d_flush_cond)) {
+ bool flush_l1d;
+
+ /*
+ * Clear the per-vcpu flush bit, it gets set again
+ * either from vcpu_run() or from one of the unsafe
+ * VMEXIT handlers.
+ */
+ flush_l1d = vcpu->arch.l1tf_flush_l1d;
+ vcpu->arch.l1tf_flush_l1d = false;
+
+ /*
+ * Clear the per-cpu flush bit, it gets set again from
+ * the interrupt handlers.
+ */
+ flush_l1d |= kvm_get_cpu_l1tf_flush_l1d();
+ kvm_clear_cpu_l1tf_flush_l1d();
+
+ if (!flush_l1d)
+ return;
+ }
+
+ vcpu->stat.l1d_flush++;
+
+ if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
+ wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
+ return;
+ }
+
+ asm volatile(
+ /* First ensure the pages are in the TLB */
+ "xorl %%eax, %%eax\n"
+ ".Lpopulate_tlb:\n\t"
+ "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
+ "addl $4096, %%eax\n\t"
+ "cmpl %%eax, %[size]\n\t"
+ "jne .Lpopulate_tlb\n\t"
+ "xorl %%eax, %%eax\n\t"
+ "cpuid\n\t"
+ /* Now fill the cache */
+ "xorl %%eax, %%eax\n"
+ ".Lfill_cache:\n"
+ "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
+ "addl $64, %%eax\n\t"
+ "cmpl %%eax, %[size]\n\t"
+ "jne .Lfill_cache\n\t"
+ "lfence\n"
+ :: [flush_pages] "r" (vmx_l1d_flush_pages),
+ [size] "r" (size)
+ : "eax", "ebx", "ecx", "edx");
+}
+
static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
@@ -9688,7 +9929,7 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
clear_atomic_switch_msr(vmx, msrs[i].msr);
else
add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest,
- msrs[i].host);
+ msrs[i].host, false);
}
static void vmx_arm_hv_timer(struct kvm_vcpu *vcpu)
@@ -9783,6 +10024,9 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
evmcs_rsp = static_branch_unlikely(&enable_evmcs) ?
(unsigned long)&current_evmcs->host_rsp : 0;
+ if (static_branch_unlikely(&vmx_l1d_should_flush))
+ vmx_l1d_flush(vcpu);
+
asm(
/* Store host registers */
"push %%" _ASM_DX "; push %%" _ASM_BP ";"
@@ -10142,10 +10386,37 @@ free_vcpu:
return ERR_PTR(err);
}
+#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n"
+#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n"
+
static int vmx_vm_init(struct kvm *kvm)
{
if (!ple_gap)
kvm->arch.pause_in_guest = true;
+
+ if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) {
+ switch (l1tf_mitigation) {
+ case L1TF_MITIGATION_OFF:
+ case L1TF_MITIGATION_FLUSH_NOWARN:
+ /* 'I explicitly don't care' is set */
+ break;
+ case L1TF_MITIGATION_FLUSH:
+ case L1TF_MITIGATION_FLUSH_NOSMT:
+ case L1TF_MITIGATION_FULL:
+ /*
+ * Warn upon starting the first VM in a potentially
+ * insecure environment.
+ */
+ if (cpu_smt_control == CPU_SMT_ENABLED)
+ pr_warn_once(L1TF_MSG_SMT);
+ if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER)
+ pr_warn_once(L1TF_MSG_L1D);
+ break;
+ case L1TF_MITIGATION_FULL_FORCE:
+ /* Flush is enforced */
+ break;
+ }
+ }
return 0;
}
@@ -11005,10 +11276,10 @@ static void prepare_vmcs02_full(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
* Set the MSR load/store lists to match L0's settings.
*/
vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
- vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
- vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host));
- vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
- vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest));
+ vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
+ vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
+ vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
+ vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
set_cr4_guest_host_mask(vmx);
@@ -11642,6 +11913,9 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
if (ret)
return ret;
+ /* Hide L1D cache contents from the nested guest. */
+ vmx->vcpu.arch.l1tf_flush_l1d = true;
+
/*
* If we're entering a halted L2 vcpu and the L2 vcpu won't be woken
* by event injection, halt vcpu.
@@ -12155,8 +12429,8 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
vmx_segment_cache_clear(vmx);
/* Update any VMCS fields that might have changed while L2 ran */
- vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
- vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
+ vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
+ vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
if (vmx->hv_deadline_tsc == -1)
vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
@@ -12868,6 +13142,51 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
.enable_smi_window = enable_smi_window,
};
+static void vmx_cleanup_l1d_flush(void)
+{
+ if (vmx_l1d_flush_pages) {
+ free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER);
+ vmx_l1d_flush_pages = NULL;
+ }
+ /* Restore state so sysfs ignores VMX */
+ l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
+}
+
+static void vmx_exit(void)
+{
+#ifdef CONFIG_KEXEC_CORE
+ RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
+ synchronize_rcu();
+#endif
+
+ kvm_exit();
+
+#if IS_ENABLED(CONFIG_HYPERV)
+ if (static_branch_unlikely(&enable_evmcs)) {
+ int cpu;
+ struct hv_vp_assist_page *vp_ap;
+ /*
+ * Reset everything to support using non-enlightened VMCS
+ * access later (e.g. when we reload the module with
+ * enlightened_vmcs=0)
+ */
+ for_each_online_cpu(cpu) {
+ vp_ap = hv_get_vp_assist_page(cpu);
+
+ if (!vp_ap)
+ continue;
+
+ vp_ap->current_nested_vmcs = 0;
+ vp_ap->enlighten_vmentry = 0;
+ }
+
+ static_branch_disable(&enable_evmcs);
+ }
+#endif
+ vmx_cleanup_l1d_flush();
+}
+module_exit(vmx_exit);
+
static int __init vmx_init(void)
{
int r;
@@ -12902,10 +13221,25 @@ static int __init vmx_init(void)
#endif
r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
- __alignof__(struct vcpu_vmx), THIS_MODULE);
+ __alignof__(struct vcpu_vmx), THIS_MODULE);
if (r)
return r;
+ /*
+ * Must be called after kvm_init() so enable_ept is properly set
+ * up. Hand the parameter mitigation value in which was stored in
+ * the pre module init parser. If no parameter was given, it will
+ * contain 'auto' which will be turned into the default 'cond'
+ * mitigation mode.
+ */
+ if (boot_cpu_has(X86_BUG_L1TF)) {
+ r = vmx_setup_l1d_flush(vmentry_l1d_flush_param);
+ if (r) {
+ vmx_exit();
+ return r;
+ }
+ }
+
#ifdef CONFIG_KEXEC_CORE
rcu_assign_pointer(crash_vmclear_loaded_vmcss,
crash_vmclear_local_loaded_vmcss);
@@ -12913,39 +13247,4 @@ static int __init vmx_init(void)
return 0;
}
-
-static void __exit vmx_exit(void)
-{
-#ifdef CONFIG_KEXEC_CORE
- RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
- synchronize_rcu();
-#endif
-
- kvm_exit();
-
-#if IS_ENABLED(CONFIG_HYPERV)
- if (static_branch_unlikely(&enable_evmcs)) {
- int cpu;
- struct hv_vp_assist_page *vp_ap;
- /*
- * Reset everything to support using non-enlightened VMCS
- * access later (e.g. when we reload the module with
- * enlightened_vmcs=0)
- */
- for_each_online_cpu(cpu) {
- vp_ap = hv_get_vp_assist_page(cpu);
-
- if (!vp_ap)
- continue;
-
- vp_ap->current_nested_vmcs = 0;
- vp_ap->enlighten_vmentry = 0;
- }
-
- static_branch_disable(&enable_evmcs);
- }
-#endif
-}
-
-module_init(vmx_init)
-module_exit(vmx_exit)
+module_init(vmx_init);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ac01341f2d1f..0125698b9b70 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -194,6 +194,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "irq_injections", VCPU_STAT(irq_injections) },
{ "nmi_injections", VCPU_STAT(nmi_injections) },
{ "req_event", VCPU_STAT(req_event) },
+ { "l1d_flush", VCPU_STAT(l1d_flush) },
{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
{ "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
@@ -1097,11 +1098,35 @@ static u32 msr_based_features[] = {
static unsigned int num_msr_based_features;
+u64 kvm_get_arch_capabilities(void)
+{
+ u64 data;
+
+ rdmsrl_safe(MSR_IA32_ARCH_CAPABILITIES, &data);
+
+ /*
+ * If we're doing cache flushes (either "always" or "cond")
+ * we will do one whenever the guest does a vmlaunch/vmresume.
+ * If an outer hypervisor is doing the cache flush for us
+ * (VMENTER_L1D_FLUSH_NESTED_VM), we can safely pass that
+ * capability to the guest too, and if EPT is disabled we're not
+ * vulnerable. Overall, only VMENTER_L1D_FLUSH_NEVER will
+ * require a nested hypervisor to do a flush of its own.
+ */
+ if (l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_NEVER)
+ data |= ARCH_CAP_SKIP_VMENTRY_L1DFLUSH;
+
+ return data;
+}
+EXPORT_SYMBOL_GPL(kvm_get_arch_capabilities);
+
static int kvm_get_msr_feature(struct kvm_msr_entry *msr)
{
switch (msr->index) {
- case MSR_IA32_UCODE_REV:
case MSR_IA32_ARCH_CAPABILITIES:
+ msr->data = kvm_get_arch_capabilities();
+ break;
+ case MSR_IA32_UCODE_REV:
rdmsrl_safe(msr->index, &msr->data);
break;
default:
@@ -4870,6 +4895,9 @@ static int emulator_write_std(struct x86_emulate_ctxt *ctxt, gva_t addr, void *v
int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val,
unsigned int bytes, struct x86_exception *exception)
{
+ /* kvm_write_guest_virt_system can pull in tons of pages. */
+ vcpu->arch.l1tf_flush_l1d = true;
+
return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
PFERR_WRITE_MASK, exception);
}
@@ -6046,6 +6074,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
bool writeback = true;
bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
+ vcpu->arch.l1tf_flush_l1d = true;
+
/*
* Clear write_fault_to_shadow_pgtable here to ensure it is
* never reused.
@@ -7575,6 +7605,7 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
struct kvm *kvm = vcpu->kvm;
vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
+ vcpu->arch.l1tf_flush_l1d = true;
for (;;) {
if (kvm_vcpu_running(vcpu)) {
@@ -8694,6 +8725,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
{
+ vcpu->arch.l1tf_flush_l1d = true;
kvm_x86_ops->sched_in(vcpu, cpu);
}
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index cee58a972cb2..83241eb71cd4 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -4,6 +4,8 @@
#include <linux/swap.h>
#include <linux/memblock.h>
#include <linux/bootmem.h> /* for max_low_pfn */
+#include <linux/swapfile.h>
+#include <linux/swapops.h>
#include <asm/set_memory.h>
#include <asm/e820/api.h>
@@ -880,3 +882,26 @@ void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache)
__cachemode2pte_tbl[cache] = __cm_idx2pte(entry);
__pte2cachemode_tbl[entry] = cache;
}
+
+#ifdef CONFIG_SWAP
+unsigned long max_swapfile_size(void)
+{
+ unsigned long pages;
+
+ pages = generic_max_swapfile_size();
+
+ if (boot_cpu_has_bug(X86_BUG_L1TF)) {
+ /* Limit the swap file size to MAX_PA/2 for L1TF workaround */
+ unsigned long l1tf_limit = l1tf_pfn_limit() + 1;
+ /*
+ * We encode swap offsets also with 3 bits below those for pfn
+ * which makes the usable limit higher.
+ */
+#if CONFIG_PGTABLE_LEVELS > 2
+ l1tf_limit <<= PAGE_SHIFT - SWP_OFFSET_FIRST_BIT;
+#endif
+ pages = min_t(unsigned long, l1tf_limit, pages);
+ }
+ return pages;
+}
+#endif
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index 7c8686709636..79eb55ce69a9 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -126,24 +126,29 @@ static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long addr)
static void clear_pmd_presence(pmd_t *pmd, bool clear, pmdval_t *old)
{
+ pmd_t new_pmd;
pmdval_t v = pmd_val(*pmd);
if (clear) {
- *old = v & _PAGE_PRESENT;
- v &= ~_PAGE_PRESENT;
- } else /* presume this has been called with clear==true previously */
- v |= *old;
- set_pmd(pmd, __pmd(v));
+ *old = v;
+ new_pmd = pmd_mknotpresent(*pmd);
+ } else {
+ /* Presume this has been called with clear==true previously */
+ new_pmd = __pmd(*old);
+ }
+ set_pmd(pmd, new_pmd);
}
static void clear_pte_presence(pte_t *pte, bool clear, pteval_t *old)
{
pteval_t v = pte_val(*pte);
if (clear) {
- *old = v & _PAGE_PRESENT;
- v &= ~_PAGE_PRESENT;
- } else /* presume this has been called with clear==true previously */
- v |= *old;
- set_pte_atomic(pte, __pte(v));
+ *old = v;
+ /* Nothing should care about address */
+ pte_clear(&init_mm, 0, pte);
+ } else {
+ /* Presume this has been called with clear==true previously */
+ set_pte_atomic(pte, __pte(*old));
+ }
}
static int clear_page_presence(struct kmmio_fault_page *f, bool clear)
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index 48c591251600..f40ab8185d94 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -240,3 +240,24 @@ int valid_mmap_phys_addr_range(unsigned long pfn, size_t count)
return phys_addr_valid(addr + count - 1);
}
+
+/*
+ * Only allow root to set high MMIO mappings to PROT_NONE.
+ * This prevents an unpriv. user to set them to PROT_NONE and invert
+ * them, then pointing to valid memory for L1TF speculation.
+ *
+ * Note: for locked down kernels may want to disable the root override.
+ */
+bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot)
+{
+ if (!boot_cpu_has_bug(X86_BUG_L1TF))
+ return true;
+ if (!__pte_needs_invert(pgprot_val(prot)))
+ return true;
+ /* If it's real memory always allow */
+ if (pfn_valid(pfn))
+ return true;
+ if (pfn > l1tf_pfn_limit() && !capable(CAP_SYS_ADMIN))
+ return false;
+ return true;
+}
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 3bded76e8d5c..29505724202a 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -1014,8 +1014,8 @@ static long populate_pmd(struct cpa_data *cpa,
pmd = pmd_offset(pud, start);
- set_pmd(pmd, __pmd(cpa->pfn << PAGE_SHIFT | _PAGE_PSE |
- massage_pgprot(pmd_pgprot)));
+ set_pmd(pmd, pmd_mkhuge(pfn_pmd(cpa->pfn,
+ canon_pgprot(pmd_pgprot))));
start += PMD_SIZE;
cpa->pfn += PMD_SIZE >> PAGE_SHIFT;
@@ -1087,8 +1087,8 @@ static int populate_pud(struct cpa_data *cpa, unsigned long start, p4d_t *p4d,
* Map everything starting from the Gb boundary, possibly with 1G pages
*/
while (boot_cpu_has(X86_FEATURE_GBPAGES) && end - start >= PUD_SIZE) {
- set_pud(pud, __pud(cpa->pfn << PAGE_SHIFT | _PAGE_PSE |
- massage_pgprot(pud_pgprot)));
+ set_pud(pud, pud_mkhuge(pfn_pud(cpa->pfn,
+ canon_pgprot(pud_pgprot))));
start += PUD_SIZE;
cpa->pfn += PUD_SIZE >> PAGE_SHIFT;
@@ -1784,6 +1784,12 @@ int set_memory_nonglobal(unsigned long addr, int numpages)
__pgprot(_PAGE_GLOBAL), 0);
}
+int set_memory_global(unsigned long addr, int numpages)
+{
+ return change_page_attr_set(&addr, numpages,
+ __pgprot(_PAGE_GLOBAL), 0);
+}
+
static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
{
struct cpa_data cpa;
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index ffc8c13c50e4..64df979a97e2 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -715,28 +715,50 @@ int pmd_clear_huge(pmd_t *pmd)
return 0;
}
+#ifdef CONFIG_X86_64
/**
* pud_free_pmd_page - Clear pud entry and free pmd page.
* @pud: Pointer to a PUD.
+ * @addr: Virtual address associated with pud.
*
- * Context: The pud range has been unmaped and TLB purged.
+ * Context: The pud range has been unmapped and TLB purged.
* Return: 1 if clearing the entry succeeded. 0 otherwise.
+ *
+ * NOTE: Callers must allow a single page allocation.
*/
-int pud_free_pmd_page(pud_t *pud)
+int pud_free_pmd_page(pud_t *pud, unsigned long addr)
{
- pmd_t *pmd;
+ pmd_t *pmd, *pmd_sv;
+ pte_t *pte;
int i;
if (pud_none(*pud))
return 1;
pmd = (pmd_t *)pud_page_vaddr(*pud);
+ pmd_sv = (pmd_t *)__get_free_page(GFP_KERNEL);
+ if (!pmd_sv)
+ return 0;
- for (i = 0; i < PTRS_PER_PMD; i++)
- if (!pmd_free_pte_page(&pmd[i]))
- return 0;
+ for (i = 0; i < PTRS_PER_PMD; i++) {
+ pmd_sv[i] = pmd[i];
+ if (!pmd_none(pmd[i]))
+ pmd_clear(&pmd[i]);
+ }
pud_clear(pud);
+
+ /* INVLPG to clear all paging-structure caches */
+ flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);
+
+ for (i = 0; i < PTRS_PER_PMD; i++) {
+ if (!pmd_none(pmd_sv[i])) {
+ pte = (pte_t *)pmd_page_vaddr(pmd_sv[i]);
+ free_page((unsigned long)pte);
+ }
+ }
+
+ free_page((unsigned long)pmd_sv);
free_page((unsigned long)pmd);
return 1;
@@ -745,11 +767,12 @@ int pud_free_pmd_page(pud_t *pud)
/**
* pmd_free_pte_page - Clear pmd entry and free pte page.
* @pmd: Pointer to a PMD.
+ * @addr: Virtual address associated with pmd.
*
- * Context: The pmd range has been unmaped and TLB purged.
+ * Context: The pmd range has been unmapped and TLB purged.
* Return: 1 if clearing the entry succeeded. 0 otherwise.
*/
-int pmd_free_pte_page(pmd_t *pmd)
+int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
{
pte_t *pte;
@@ -758,8 +781,30 @@ int pmd_free_pte_page(pmd_t *pmd)
pte = (pte_t *)pmd_page_vaddr(*pmd);
pmd_clear(pmd);
+
+ /* INVLPG to clear all paging-structure caches */
+ flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);
+
free_page((unsigned long)pte);
return 1;
}
+
+#else /* !CONFIG_X86_64 */
+
+int pud_free_pmd_page(pud_t *pud, unsigned long addr)
+{
+ return pud_none(*pud);
+}
+
+/*
+ * Disable free page handling on x86-PAE. This assures that ioremap()
+ * does not update sync'd pmd entries. See vmalloc_sync_one().
+ */
+int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
+{
+ return pmd_none(*pmd);
+}
+
+#endif /* CONFIG_X86_64 */
#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index 4d418e705878..946455e9cfef 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -45,6 +45,7 @@
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/desc.h>
+#include <asm/sections.h>
#undef pr_fmt
#define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt
@@ -435,6 +436,13 @@ static inline bool pti_kernel_image_global_ok(void)
}
/*
+ * This is the only user for these and it is not arch-generic
+ * like the other set_memory.h functions. Just extern them.
+ */
+extern int set_memory_nonglobal(unsigned long addr, int numpages);
+extern int set_memory_global(unsigned long addr, int numpages);
+
+/*
* For some configurations, map all of kernel text into the user page
* tables. This reduces TLB misses, especially on non-PCID systems.
*/
@@ -446,7 +454,8 @@ void pti_clone_kernel_text(void)
* clone the areas past rodata, they might contain secrets.
*/
unsigned long start = PFN_ALIGN(_text);
- unsigned long end = (unsigned long)__end_rodata_hpage_align;
+ unsigned long end_clone = (unsigned long)__end_rodata_hpage_align;
+ unsigned long end_global = PFN_ALIGN((unsigned long)__stop___ex_table);
if (!pti_kernel_image_global_ok())
return;
@@ -458,14 +467,18 @@ void pti_clone_kernel_text(void)
* pti_set_kernel_image_nonglobal() did to clear the
* global bit.
*/
- pti_clone_pmds(start, end, _PAGE_RW);
+ pti_clone_pmds(start, end_clone, _PAGE_RW);
+
+ /*
+ * pti_clone_pmds() will set the global bit in any PMDs
+ * that it clones, but we also need to get any PTEs in
+ * the last level for areas that are not huge-page-aligned.
+ */
+
+ /* Set the global bit for normal non-__init kernel text: */
+ set_memory_global(start, (end_global - start) >> PAGE_SHIFT);
}
-/*
- * This is the only user for it and it is not arch-generic like
- * the other set_memory.h functions. Just extern it.
- */
-extern int set_memory_nonglobal(unsigned long addr, int numpages);
void pti_set_kernel_image_nonglobal(void)
{
/*
@@ -477,9 +490,11 @@ void pti_set_kernel_image_nonglobal(void)
unsigned long start = PFN_ALIGN(_text);
unsigned long end = ALIGN((unsigned long)_end, PMD_PAGE_SIZE);
- if (pti_kernel_image_global_ok())
- return;
-
+ /*
+ * This clears _PAGE_GLOBAL from the entire kernel image.
+ * pti_clone_kernel_text() map put _PAGE_GLOBAL back for
+ * areas that are mapped to userspace.
+ */
set_memory_nonglobal(start, (end - start) >> PAGE_SHIFT);
}
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c
index 4f5fa65a1011..2acd6be13375 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c
+++ b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c
@@ -18,6 +18,7 @@
#include <asm/intel-mid.h>
#include <asm/intel_scu_ipc.h>
#include <asm/io_apic.h>
+#include <asm/hw_irq.h>
#define TANGIER_EXT_TIMER0_MSI 12
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index b36caae0fb2f..c29d3152f5a4 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -1285,6 +1285,7 @@ void uv_bau_message_interrupt(struct pt_regs *regs)
struct msg_desc msgdesc;
ack_APIC_irq();
+ kvm_set_cpu_l1tf_flush_l1d();
time_start = get_cycles();
bcp = &per_cpu(bau_control, smp_processor_id());
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index c9081c6671f0..df208af3cd74 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -3,6 +3,7 @@
#endif
#include <linux/cpu.h>
#include <linux/kexec.h>
+#include <linux/slab.h>
#include <xen/features.h>
#include <xen/page.h>
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index ce8f6a8d4ac9..ca0fce74814b 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -1258,6 +1258,9 @@ asmlinkage __visible void __init xen_start_kernel(void)
get_cpu_cap(&boot_cpu_data);
x86_configure_nx();
+ /* Determine virtual and physical address sizes */
+ get_cpu_address_sizes(&boot_cpu_data);
+
/* Let's presume PV guests always boot on vCPU with id 0. */
per_cpu(xen_vcpu_id, 0) = 0;
diff --git a/crypto/ablkcipher.c b/crypto/ablkcipher.c
index d880a4897159..4ee7c041bb82 100644
--- a/crypto/ablkcipher.c
+++ b/crypto/ablkcipher.c
@@ -71,11 +71,9 @@ static inline u8 *ablkcipher_get_spot(u8 *start, unsigned int len)
return max(start, end_page);
}
-static inline unsigned int ablkcipher_done_slow(struct ablkcipher_walk *walk,
- unsigned int bsize)
+static inline void ablkcipher_done_slow(struct ablkcipher_walk *walk,
+ unsigned int n)
{
- unsigned int n = bsize;
-
for (;;) {
unsigned int len_this_page = scatterwalk_pagelen(&walk->out);
@@ -87,17 +85,13 @@ static inline unsigned int ablkcipher_done_slow(struct ablkcipher_walk *walk,
n -= len_this_page;
scatterwalk_start(&walk->out, sg_next(walk->out.sg));
}
-
- return bsize;
}
-static inline unsigned int ablkcipher_done_fast(struct ablkcipher_walk *walk,
- unsigned int n)
+static inline void ablkcipher_done_fast(struct ablkcipher_walk *walk,
+ unsigned int n)
{
scatterwalk_advance(&walk->in, n);
scatterwalk_advance(&walk->out, n);
-
- return n;
}
static int ablkcipher_walk_next(struct ablkcipher_request *req,
@@ -107,39 +101,40 @@ int ablkcipher_walk_done(struct ablkcipher_request *req,
struct ablkcipher_walk *walk, int err)
{
struct crypto_tfm *tfm = req->base.tfm;
- unsigned int nbytes = 0;
+ unsigned int n; /* bytes processed */
+ bool more;
- if (likely(err >= 0)) {
- unsigned int n = walk->nbytes - err;
+ if (unlikely(err < 0))
+ goto finish;
- if (likely(!(walk->flags & ABLKCIPHER_WALK_SLOW)))
- n = ablkcipher_done_fast(walk, n);
- else if (WARN_ON(err)) {
- err = -EINVAL;
- goto err;
- } else
- n = ablkcipher_done_slow(walk, n);
+ n = walk->nbytes - err;
+ walk->total -= n;
+ more = (walk->total != 0);
- nbytes = walk->total - n;
- err = 0;
+ if (likely(!(walk->flags & ABLKCIPHER_WALK_SLOW))) {
+ ablkcipher_done_fast(walk, n);
+ } else {
+ if (WARN_ON(err)) {
+ /* unexpected case; didn't process all bytes */
+ err = -EINVAL;
+ goto finish;
+ }
+ ablkcipher_done_slow(walk, n);
}
- scatterwalk_done(&walk->in, 0, nbytes);
- scatterwalk_done(&walk->out, 1, nbytes);
-
-err:
- walk->total = nbytes;
- walk->nbytes = nbytes;
+ scatterwalk_done(&walk->in, 0, more);
+ scatterwalk_done(&walk->out, 1, more);
- if (nbytes) {
+ if (more) {
crypto_yield(req->base.flags);
return ablkcipher_walk_next(req, walk);
}
-
+ err = 0;
+finish:
+ walk->nbytes = 0;
if (walk->iv != req->info)
memcpy(req->info, walk->iv, tfm->crt_ablkcipher.ivsize);
kfree(walk->iv_buffer);
-
return err;
}
EXPORT_SYMBOL_GPL(ablkcipher_walk_done);
diff --git a/crypto/blkcipher.c b/crypto/blkcipher.c
index 01c0d4aa2563..77b5fa293f66 100644
--- a/crypto/blkcipher.c
+++ b/crypto/blkcipher.c
@@ -70,19 +70,18 @@ static inline u8 *blkcipher_get_spot(u8 *start, unsigned int len)
return max(start, end_page);
}
-static inline unsigned int blkcipher_done_slow(struct blkcipher_walk *walk,
- unsigned int bsize)
+static inline void blkcipher_done_slow(struct blkcipher_walk *walk,
+ unsigned int bsize)
{
u8 *addr;
addr = (u8 *)ALIGN((unsigned long)walk->buffer, walk->alignmask + 1);
addr = blkcipher_get_spot(addr, bsize);
scatterwalk_copychunks(addr, &walk->out, bsize, 1);
- return bsize;
}
-static inline unsigned int blkcipher_done_fast(struct blkcipher_walk *walk,
- unsigned int n)
+static inline void blkcipher_done_fast(struct blkcipher_walk *walk,
+ unsigned int n)
{
if (walk->flags & BLKCIPHER_WALK_COPY) {
blkcipher_map_dst(walk);
@@ -96,49 +95,48 @@ static inline unsigned int blkcipher_done_fast(struct blkcipher_walk *walk,
scatterwalk_advance(&walk->in, n);
scatterwalk_advance(&walk->out, n);
-
- return n;
}
int blkcipher_walk_done(struct blkcipher_desc *desc,
struct blkcipher_walk *walk, int err)
{
- unsigned int nbytes = 0;
+ unsigned int n; /* bytes processed */
+ bool more;
- if (likely(err >= 0)) {
- unsigned int n = walk->nbytes - err;
+ if (unlikely(err < 0))
+ goto finish;
- if (likely(!(walk->flags & BLKCIPHER_WALK_SLOW)))
- n = blkcipher_done_fast(walk, n);
- else if (WARN_ON(err)) {
- err = -EINVAL;
- goto err;
- } else
- n = blkcipher_done_slow(walk, n);
+ n = walk->nbytes - err;
+ walk->total -= n;
+ more = (walk->total != 0);
- nbytes = walk->total - n;
- err = 0;
+ if (likely(!(walk->flags & BLKCIPHER_WALK_SLOW))) {
+ blkcipher_done_fast(walk, n);
+ } else {
+ if (WARN_ON(err)) {
+ /* unexpected case; didn't process all bytes */
+ err = -EINVAL;
+ goto finish;
+ }
+ blkcipher_done_slow(walk, n);
}
- scatterwalk_done(&walk->in, 0, nbytes);
- scatterwalk_done(&walk->out, 1, nbytes);
+ scatterwalk_done(&walk->in, 0, more);
+ scatterwalk_done(&walk->out, 1, more);
-err:
- walk->total = nbytes;
- walk->nbytes = nbytes;
-
- if (nbytes) {
+ if (more) {
crypto_yield(desc->flags);
return blkcipher_walk_next(desc, walk);
}
-
+ err = 0;
+finish:
+ walk->nbytes = 0;
if (walk->iv != desc->info)
memcpy(desc->info, walk->iv, walk->ivsize);
if (walk->buffer != walk->page)
kfree(walk->buffer);
if (walk->page)
free_page((unsigned long)walk->page);
-
return err;
}
EXPORT_SYMBOL_GPL(blkcipher_walk_done);
diff --git a/crypto/skcipher.c b/crypto/skcipher.c
index 0fe2a2923ad0..5dc8407bdaa9 100644
--- a/crypto/skcipher.c
+++ b/crypto/skcipher.c
@@ -95,7 +95,7 @@ static inline u8 *skcipher_get_spot(u8 *start, unsigned int len)
return max(start, end_page);
}
-static int skcipher_done_slow(struct skcipher_walk *walk, unsigned int bsize)
+static void skcipher_done_slow(struct skcipher_walk *walk, unsigned int bsize)
{
u8 *addr;
@@ -103,23 +103,24 @@ static int skcipher_done_slow(struct skcipher_walk *walk, unsigned int bsize)
addr = skcipher_get_spot(addr, bsize);
scatterwalk_copychunks(addr, &walk->out, bsize,
(walk->flags & SKCIPHER_WALK_PHYS) ? 2 : 1);
- return 0;
}
int skcipher_walk_done(struct skcipher_walk *walk, int err)
{
- unsigned int n = walk->nbytes - err;
- unsigned int nbytes;
-
- nbytes = walk->total - n;
-
- if (unlikely(err < 0)) {
- nbytes = 0;
- n = 0;
- } else if (likely(!(walk->flags & (SKCIPHER_WALK_PHYS |
- SKCIPHER_WALK_SLOW |
- SKCIPHER_WALK_COPY |
- SKCIPHER_WALK_DIFF)))) {
+ unsigned int n; /* bytes processed */
+ bool more;
+
+ if (unlikely(err < 0))
+ goto finish;
+
+ n = walk->nbytes - err;
+ walk->total -= n;
+ more = (walk->total != 0);
+
+ if (likely(!(walk->flags & (SKCIPHER_WALK_PHYS |
+ SKCIPHER_WALK_SLOW |
+ SKCIPHER_WALK_COPY |
+ SKCIPHER_WALK_DIFF)))) {
unmap_src:
skcipher_unmap_src(walk);
} else if (walk->flags & SKCIPHER_WALK_DIFF) {
@@ -131,28 +132,28 @@ unmap_src:
skcipher_unmap_dst(walk);
} else if (unlikely(walk->flags & SKCIPHER_WALK_SLOW)) {
if (WARN_ON(err)) {
+ /* unexpected case; didn't process all bytes */
err = -EINVAL;
- nbytes = 0;
- } else
- n = skcipher_done_slow(walk, n);
+ goto finish;
+ }
+ skcipher_done_slow(walk, n);
+ goto already_advanced;
}
- if (err > 0)
- err = 0;
-
- walk->total = nbytes;
- walk->nbytes = nbytes;
-
scatterwalk_advance(&walk->in, n);
scatterwalk_advance(&walk->out, n);
- scatterwalk_done(&walk->in, 0, nbytes);
- scatterwalk_done(&walk->out, 1, nbytes);
+already_advanced:
+ scatterwalk_done(&walk->in, 0, more);
+ scatterwalk_done(&walk->out, 1, more);
- if (nbytes) {
+ if (more) {
crypto_yield(walk->flags & SKCIPHER_WALK_SLEEP ?
CRYPTO_TFM_REQ_MAY_SLEEP : 0);
return skcipher_walk_next(walk);
}
+ err = 0;
+finish:
+ walk->nbytes = 0;
/* Short-circuit for the common/fast path. */
if (!((unsigned long)walk->buffer | (unsigned long)walk->page))
@@ -399,7 +400,7 @@ static int skcipher_copy_iv(struct skcipher_walk *walk)
unsigned size;
u8 *iv;
- aligned_bs = ALIGN(bs, alignmask);
+ aligned_bs = ALIGN(bs, alignmask + 1);
/* Minimum size to align buffer by alignmask. */
size = alignmask & ~a;
diff --git a/crypto/vmac.c b/crypto/vmac.c
index df76a816cfb2..bb2fc787d615 100644
--- a/crypto/vmac.c
+++ b/crypto/vmac.c
@@ -1,6 +1,10 @@
/*
- * Modified to interface to the Linux kernel
+ * VMAC: Message Authentication Code using Universal Hashing
+ *
+ * Reference: https://tools.ietf.org/html/draft-krovetz-vmac-01
+ *
* Copyright (c) 2009, Intel Corporation.
+ * Copyright (c) 2018, Google Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
@@ -16,14 +20,15 @@
* Place - Suite 330, Boston, MA 02111-1307 USA.
*/
-/* --------------------------------------------------------------------------
- * VMAC and VHASH Implementation by Ted Krovetz (tdk@acm.org) and Wei Dai.
- * This implementation is herby placed in the public domain.
- * The authors offers no warranty. Use at your own risk.
- * Please send bug reports to the authors.
- * Last modified: 17 APR 08, 1700 PDT
- * ----------------------------------------------------------------------- */
+/*
+ * Derived from:
+ * VMAC and VHASH Implementation by Ted Krovetz (tdk@acm.org) and Wei Dai.
+ * This implementation is herby placed in the public domain.
+ * The authors offers no warranty. Use at your own risk.
+ * Last modified: 17 APR 08, 1700 PDT
+ */
+#include <asm/unaligned.h>
#include <linux/init.h>
#include <linux/types.h>
#include <linux/crypto.h>
@@ -31,10 +36,36 @@
#include <linux/scatterlist.h>
#include <asm/byteorder.h>
#include <crypto/scatterwalk.h>
-#include <crypto/vmac.h>
#include <crypto/internal/hash.h>
/*
+ * User definable settings.
+ */
+#define VMAC_TAG_LEN 64
+#define VMAC_KEY_SIZE 128/* Must be 128, 192 or 256 */
+#define VMAC_KEY_LEN (VMAC_KEY_SIZE/8)
+#define VMAC_NHBYTES 128/* Must 2^i for any 3 < i < 13 Standard = 128*/
+
+/* per-transform (per-key) context */
+struct vmac_tfm_ctx {
+ struct crypto_cipher *cipher;
+ u64 nhkey[(VMAC_NHBYTES/8)+2*(VMAC_TAG_LEN/64-1)];
+ u64 polykey[2*VMAC_TAG_LEN/64];
+ u64 l3key[2*VMAC_TAG_LEN/64];
+};
+
+/* per-request context */
+struct vmac_desc_ctx {
+ union {
+ u8 partial[VMAC_NHBYTES]; /* partial block */
+ __le64 partial_words[VMAC_NHBYTES / 8];
+ };
+ unsigned int partial_size; /* size of the partial block */
+ bool first_block_processed;
+ u64 polytmp[2*VMAC_TAG_LEN/64]; /* running total of L2-hash */
+};
+
+/*
* Constants and masks
*/
#define UINT64_C(x) x##ULL
@@ -318,13 +349,6 @@ static void poly_step_func(u64 *ahi, u64 *alo,
} while (0)
#endif
-static void vhash_abort(struct vmac_ctx *ctx)
-{
- ctx->polytmp[0] = ctx->polykey[0] ;
- ctx->polytmp[1] = ctx->polykey[1] ;
- ctx->first_block_processed = 0;
-}
-
static u64 l3hash(u64 p1, u64 p2, u64 k1, u64 k2, u64 len)
{
u64 rh, rl, t, z = 0;
@@ -364,280 +388,209 @@ static u64 l3hash(u64 p1, u64 p2, u64 k1, u64 k2, u64 len)
return rl;
}
-static void vhash_update(const unsigned char *m,
- unsigned int mbytes, /* Pos multiple of VMAC_NHBYTES */
- struct vmac_ctx *ctx)
+/* L1 and L2-hash one or more VMAC_NHBYTES-byte blocks */
+static void vhash_blocks(const struct vmac_tfm_ctx *tctx,
+ struct vmac_desc_ctx *dctx,
+ const __le64 *mptr, unsigned int blocks)
{
- u64 rh, rl, *mptr;
- const u64 *kptr = (u64 *)ctx->nhkey;
- int i;
- u64 ch, cl;
- u64 pkh = ctx->polykey[0];
- u64 pkl = ctx->polykey[1];
-
- if (!mbytes)
- return;
-
- BUG_ON(mbytes % VMAC_NHBYTES);
-
- mptr = (u64 *)m;
- i = mbytes / VMAC_NHBYTES; /* Must be non-zero */
-
- ch = ctx->polytmp[0];
- cl = ctx->polytmp[1];
-
- if (!ctx->first_block_processed) {
- ctx->first_block_processed = 1;
+ const u64 *kptr = tctx->nhkey;
+ const u64 pkh = tctx->polykey[0];
+ const u64 pkl = tctx->polykey[1];
+ u64 ch = dctx->polytmp[0];
+ u64 cl = dctx->polytmp[1];
+ u64 rh, rl;
+
+ if (!dctx->first_block_processed) {
+ dctx->first_block_processed = true;
nh_vmac_nhbytes(mptr, kptr, VMAC_NHBYTES/8, rh, rl);
rh &= m62;
ADD128(ch, cl, rh, rl);
mptr += (VMAC_NHBYTES/sizeof(u64));
- i--;
+ blocks--;
}
- while (i--) {
+ while (blocks--) {
nh_vmac_nhbytes(mptr, kptr, VMAC_NHBYTES/8, rh, rl);
rh &= m62;
poly_step(ch, cl, pkh, pkl, rh, rl);
mptr += (VMAC_NHBYTES/sizeof(u64));
}
- ctx->polytmp[0] = ch;
- ctx->polytmp[1] = cl;
+ dctx->polytmp[0] = ch;
+ dctx->polytmp[1] = cl;
}
-static u64 vhash(unsigned char m[], unsigned int mbytes,
- u64 *tagl, struct vmac_ctx *ctx)
+static int vmac_setkey(struct crypto_shash *tfm,
+ const u8 *key, unsigned int keylen)
{
- u64 rh, rl, *mptr;
- const u64 *kptr = (u64 *)ctx->nhkey;
- int i, remaining;
- u64 ch, cl;
- u64 pkh = ctx->polykey[0];
- u64 pkl = ctx->polykey[1];
-
- mptr = (u64 *)m;
- i = mbytes / VMAC_NHBYTES;
- remaining = mbytes % VMAC_NHBYTES;
-
- if (ctx->first_block_processed) {
- ch = ctx->polytmp[0];
- cl = ctx->polytmp[1];
- } else if (i) {
- nh_vmac_nhbytes(mptr, kptr, VMAC_NHBYTES/8, ch, cl);
- ch &= m62;
- ADD128(ch, cl, pkh, pkl);
- mptr += (VMAC_NHBYTES/sizeof(u64));
- i--;
- } else if (remaining) {
- nh_16(mptr, kptr, 2*((remaining+15)/16), ch, cl);
- ch &= m62;
- ADD128(ch, cl, pkh, pkl);
- mptr += (VMAC_NHBYTES/sizeof(u64));
- goto do_l3;
- } else {/* Empty String */
- ch = pkh; cl = pkl;
- goto do_l3;
- }
-
- while (i--) {
- nh_vmac_nhbytes(mptr, kptr, VMAC_NHBYTES/8, rh, rl);
- rh &= m62;
- poly_step(ch, cl, pkh, pkl, rh, rl);
- mptr += (VMAC_NHBYTES/sizeof(u64));
- }
- if (remaining) {
- nh_16(mptr, kptr, 2*((remaining+15)/16), rh, rl);
- rh &= m62;
- poly_step(ch, cl, pkh, pkl, rh, rl);
- }
-
-do_l3:
- vhash_abort(ctx);
- remaining *= 8;
- return l3hash(ch, cl, ctx->l3key[0], ctx->l3key[1], remaining);
-}
+ struct vmac_tfm_ctx *tctx = crypto_shash_ctx(tfm);
+ __be64 out[2];
+ u8 in[16] = { 0 };
+ unsigned int i;
+ int err;
-static u64 vmac(unsigned char m[], unsigned int mbytes,
- const unsigned char n[16], u64 *tagl,
- struct vmac_ctx_t *ctx)
-{
- u64 *in_n, *out_p;
- u64 p, h;
- int i;
-
- in_n = ctx->__vmac_ctx.cached_nonce;
- out_p = ctx->__vmac_ctx.cached_aes;
-
- i = n[15] & 1;
- if ((*(u64 *)(n+8) != in_n[1]) || (*(u64 *)(n) != in_n[0])) {
- in_n[0] = *(u64 *)(n);
- in_n[1] = *(u64 *)(n+8);
- ((unsigned char *)in_n)[15] &= 0xFE;
- crypto_cipher_encrypt_one(ctx->child,
- (unsigned char *)out_p, (unsigned char *)in_n);
-
- ((unsigned char *)in_n)[15] |= (unsigned char)(1-i);
+ if (keylen != VMAC_KEY_LEN) {
+ crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
+ return -EINVAL;
}
- p = be64_to_cpup(out_p + i);
- h = vhash(m, mbytes, (u64 *)0, &ctx->__vmac_ctx);
- return le64_to_cpu(p + h);
-}
-static int vmac_set_key(unsigned char user_key[], struct vmac_ctx_t *ctx)
-{
- u64 in[2] = {0}, out[2];
- unsigned i;
- int err = 0;
-
- err = crypto_cipher_setkey(ctx->child, user_key, VMAC_KEY_LEN);
+ err = crypto_cipher_setkey(tctx->cipher, key, keylen);
if (err)
return err;
/* Fill nh key */
- ((unsigned char *)in)[0] = 0x80;
- for (i = 0; i < sizeof(ctx->__vmac_ctx.nhkey)/8; i += 2) {
- crypto_cipher_encrypt_one(ctx->child,
- (unsigned char *)out, (unsigned char *)in);
- ctx->__vmac_ctx.nhkey[i] = be64_to_cpup(out);
- ctx->__vmac_ctx.nhkey[i+1] = be64_to_cpup(out+1);
- ((unsigned char *)in)[15] += 1;
+ in[0] = 0x80;
+ for (i = 0; i < ARRAY_SIZE(tctx->nhkey); i += 2) {
+ crypto_cipher_encrypt_one(tctx->cipher, (u8 *)out, in);
+ tctx->nhkey[i] = be64_to_cpu(out[0]);
+ tctx->nhkey[i+1] = be64_to_cpu(out[1]);
+ in[15]++;
}
/* Fill poly key */
- ((unsigned char *)in)[0] = 0xC0;
- in[1] = 0;
- for (i = 0; i < sizeof(ctx->__vmac_ctx.polykey)/8; i += 2) {
- crypto_cipher_encrypt_one(ctx->child,
- (unsigned char *)out, (unsigned char *)in);
- ctx->__vmac_ctx.polytmp[i] =
- ctx->__vmac_ctx.polykey[i] =
- be64_to_cpup(out) & mpoly;
- ctx->__vmac_ctx.polytmp[i+1] =
- ctx->__vmac_ctx.polykey[i+1] =
- be64_to_cpup(out+1) & mpoly;
- ((unsigned char *)in)[15] += 1;
+ in[0] = 0xC0;
+ in[15] = 0;
+ for (i = 0; i < ARRAY_SIZE(tctx->polykey); i += 2) {
+ crypto_cipher_encrypt_one(tctx->cipher, (u8 *)out, in);
+ tctx->polykey[i] = be64_to_cpu(out[0]) & mpoly;
+ tctx->polykey[i+1] = be64_to_cpu(out[1]) & mpoly;
+ in[15]++;
}
/* Fill ip key */
- ((unsigned char *)in)[0] = 0xE0;
- in[1] = 0;
- for (i = 0; i < sizeof(ctx->__vmac_ctx.l3key)/8; i += 2) {
+ in[0] = 0xE0;
+ in[15] = 0;
+ for (i = 0; i < ARRAY_SIZE(tctx->l3key); i += 2) {
do {
- crypto_cipher_encrypt_one(ctx->child,
- (unsigned char *)out, (unsigned char *)in);
- ctx->__vmac_ctx.l3key[i] = be64_to_cpup(out);
- ctx->__vmac_ctx.l3key[i+1] = be64_to_cpup(out+1);
- ((unsigned char *)in)[15] += 1;
- } while (ctx->__vmac_ctx.l3key[i] >= p64
- || ctx->__vmac_ctx.l3key[i+1] >= p64);
+ crypto_cipher_encrypt_one(tctx->cipher, (u8 *)out, in);
+ tctx->l3key[i] = be64_to_cpu(out[0]);
+ tctx->l3key[i+1] = be64_to_cpu(out[1]);
+ in[15]++;
+ } while (tctx->l3key[i] >= p64 || tctx->l3key[i+1] >= p64);
}
- /* Invalidate nonce/aes cache and reset other elements */
- ctx->__vmac_ctx.cached_nonce[0] = (u64)-1; /* Ensure illegal nonce */
- ctx->__vmac_ctx.cached_nonce[1] = (u64)0; /* Ensure illegal nonce */
- ctx->__vmac_ctx.first_block_processed = 0;
-
- return err;
+ return 0;
}
-static int vmac_setkey(struct crypto_shash *parent,
- const u8 *key, unsigned int keylen)
+static int vmac_init(struct shash_desc *desc)
{
- struct vmac_ctx_t *ctx = crypto_shash_ctx(parent);
+ const struct vmac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
+ struct vmac_desc_ctx *dctx = shash_desc_ctx(desc);
- if (keylen != VMAC_KEY_LEN) {
- crypto_shash_set_flags(parent, CRYPTO_TFM_RES_BAD_KEY_LEN);
- return -EINVAL;
- }
-
- return vmac_set_key((u8 *)key, ctx);
-}
-
-static int vmac_init(struct shash_desc *pdesc)
-{
+ dctx->partial_size = 0;
+ dctx->first_block_processed = false;
+ memcpy(dctx->polytmp, tctx->polykey, sizeof(dctx->polytmp));
return 0;
}
-static int vmac_update(struct shash_desc *pdesc, const u8 *p,
- unsigned int len)
+static int vmac_update(struct shash_desc *desc, const u8 *p, unsigned int len)
{
- struct crypto_shash *parent = pdesc->tfm;
- struct vmac_ctx_t *ctx = crypto_shash_ctx(parent);
- int expand;
- int min;
-
- expand = VMAC_NHBYTES - ctx->partial_size > 0 ?
- VMAC_NHBYTES - ctx->partial_size : 0;
-
- min = len < expand ? len : expand;
-
- memcpy(ctx->partial + ctx->partial_size, p, min);
- ctx->partial_size += min;
-
- if (len < expand)
- return 0;
-
- vhash_update(ctx->partial, VMAC_NHBYTES, &ctx->__vmac_ctx);
- ctx->partial_size = 0;
-
- len -= expand;
- p += expand;
+ const struct vmac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
+ struct vmac_desc_ctx *dctx = shash_desc_ctx(desc);
+ unsigned int n;
+
+ if (dctx->partial_size) {
+ n = min(len, VMAC_NHBYTES - dctx->partial_size);
+ memcpy(&dctx->partial[dctx->partial_size], p, n);
+ dctx->partial_size += n;
+ p += n;
+ len -= n;
+ if (dctx->partial_size == VMAC_NHBYTES) {
+ vhash_blocks(tctx, dctx, dctx->partial_words, 1);
+ dctx->partial_size = 0;
+ }
+ }
- if (len % VMAC_NHBYTES) {
- memcpy(ctx->partial, p + len - (len % VMAC_NHBYTES),
- len % VMAC_NHBYTES);
- ctx->partial_size = len % VMAC_NHBYTES;
+ if (len >= VMAC_NHBYTES) {
+ n = round_down(len, VMAC_NHBYTES);
+ /* TODO: 'p' may be misaligned here */
+ vhash_blocks(tctx, dctx, (const __le64 *)p, n / VMAC_NHBYTES);
+ p += n;
+ len -= n;
}
- vhash_update(p, len - len % VMAC_NHBYTES, &ctx->__vmac_ctx);
+ if (len) {
+ memcpy(dctx->partial, p, len);
+ dctx->partial_size = len;
+ }
return 0;
}
-static int vmac_final(struct shash_desc *pdesc, u8 *out)
+static u64 vhash_final(const struct vmac_tfm_ctx *tctx,
+ struct vmac_desc_ctx *dctx)
{
- struct crypto_shash *parent = pdesc->tfm;
- struct vmac_ctx_t *ctx = crypto_shash_ctx(parent);
- vmac_t mac;
- u8 nonce[16] = {};
-
- /* vmac() ends up accessing outside the array bounds that
- * we specify. In appears to access up to the next 2-word
- * boundary. We'll just be uber cautious and zero the
- * unwritten bytes in the buffer.
- */
- if (ctx->partial_size) {
- memset(ctx->partial + ctx->partial_size, 0,
- VMAC_NHBYTES - ctx->partial_size);
+ unsigned int partial = dctx->partial_size;
+ u64 ch = dctx->polytmp[0];
+ u64 cl = dctx->polytmp[1];
+
+ /* L1 and L2-hash the final block if needed */
+ if (partial) {
+ /* Zero-pad to next 128-bit boundary */
+ unsigned int n = round_up(partial, 16);
+ u64 rh, rl;
+
+ memset(&dctx->partial[partial], 0, n - partial);
+ nh_16(dctx->partial_words, tctx->nhkey, n / 8, rh, rl);
+ rh &= m62;
+ if (dctx->first_block_processed)
+ poly_step(ch, cl, tctx->polykey[0], tctx->polykey[1],
+ rh, rl);
+ else
+ ADD128(ch, cl, rh, rl);
}
- mac = vmac(ctx->partial, ctx->partial_size, nonce, NULL, ctx);
- memcpy(out, &mac, sizeof(vmac_t));
- memzero_explicit(&mac, sizeof(vmac_t));
- memset(&ctx->__vmac_ctx, 0, sizeof(struct vmac_ctx));
- ctx->partial_size = 0;
+
+ /* L3-hash the 128-bit output of L2-hash */
+ return l3hash(ch, cl, tctx->l3key[0], tctx->l3key[1], partial * 8);
+}
+
+static int vmac_final(struct shash_desc *desc, u8 *out)
+{
+ const struct vmac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
+ struct vmac_desc_ctx *dctx = shash_desc_ctx(desc);
+ static const u8 nonce[16] = {}; /* TODO: this is insecure */
+ union {
+ u8 bytes[16];
+ __be64 pads[2];
+ } block;
+ int index;
+ u64 hash, pad;
+
+ /* Finish calculating the VHASH of the message */
+ hash = vhash_final(tctx, dctx);
+
+ /* Generate pseudorandom pad by encrypting the nonce */
+ memcpy(&block, nonce, 16);
+ index = block.bytes[15] & 1;
+ block.bytes[15] &= ~1;
+ crypto_cipher_encrypt_one(tctx->cipher, block.bytes, block.bytes);
+ pad = be64_to_cpu(block.pads[index]);
+
+ /* The VMAC is the sum of VHASH and the pseudorandom pad */
+ put_unaligned_le64(hash + pad, out);
return 0;
}
static int vmac_init_tfm(struct crypto_tfm *tfm)
{
- struct crypto_cipher *cipher;
- struct crypto_instance *inst = (void *)tfm->__crt_alg;
+ struct crypto_instance *inst = crypto_tfm_alg_instance(tfm);
struct crypto_spawn *spawn = crypto_instance_ctx(inst);
- struct vmac_ctx_t *ctx = crypto_tfm_ctx(tfm);
+ struct vmac_tfm_ctx *tctx = crypto_tfm_ctx(tfm);
+ struct crypto_cipher *cipher;
cipher = crypto_spawn_cipher(spawn);
if (IS_ERR(cipher))
return PTR_ERR(cipher);
- ctx->child = cipher;
+ tctx->cipher = cipher;
return 0;
}
static void vmac_exit_tfm(struct crypto_tfm *tfm)
{
- struct vmac_ctx_t *ctx = crypto_tfm_ctx(tfm);
- crypto_free_cipher(ctx->child);
+ struct vmac_tfm_ctx *tctx = crypto_tfm_ctx(tfm);
+
+ crypto_free_cipher(tctx->cipher);
}
static int vmac_create(struct crypto_template *tmpl, struct rtattr **tb)
@@ -655,6 +608,10 @@ static int vmac_create(struct crypto_template *tmpl, struct rtattr **tb)
if (IS_ERR(alg))
return PTR_ERR(alg);
+ err = -EINVAL;
+ if (alg->cra_blocksize != 16)
+ goto out_put_alg;
+
inst = shash_alloc_instance("vmac", alg);
err = PTR_ERR(inst);
if (IS_ERR(inst))
@@ -670,11 +627,12 @@ static int vmac_create(struct crypto_template *tmpl, struct rtattr **tb)
inst->alg.base.cra_blocksize = alg->cra_blocksize;
inst->alg.base.cra_alignmask = alg->cra_alignmask;
- inst->alg.digestsize = sizeof(vmac_t);
- inst->alg.base.cra_ctxsize = sizeof(struct vmac_ctx_t);
+ inst->alg.base.cra_ctxsize = sizeof(struct vmac_tfm_ctx);
inst->alg.base.cra_init = vmac_init_tfm;
inst->alg.base.cra_exit = vmac_exit_tfm;
+ inst->alg.descsize = sizeof(struct vmac_desc_ctx);
+ inst->alg.digestsize = VMAC_TAG_LEN / 8;
inst->alg.init = vmac_init;
inst->alg.update = vmac_update;
inst->alg.final = vmac_final;
diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c
index 974e58457697..af54d7bbb173 100644
--- a/drivers/acpi/sleep.c
+++ b/drivers/acpi/sleep.c
@@ -338,6 +338,14 @@ static const struct dmi_system_id acpisleep_dmi_table[] __initconst = {
DMI_MATCH(DMI_PRODUCT_NAME, "K54HR"),
},
},
+ {
+ .callback = init_nvs_save_s3,
+ .ident = "Asus 1025C",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "1025C"),
+ },
+ },
/*
* https://bugzilla.kernel.org/show_bug.cgi?id=189431
* Lenovo G50-45 is a platform later than 2012, but needs nvs memory
diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index 30cc9c877ebb..eb9443d5bae1 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -540,16 +540,24 @@ ssize_t __weak cpu_show_spec_store_bypass(struct device *dev,
return sprintf(buf, "Not affected\n");
}
+ssize_t __weak cpu_show_l1tf(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ return sprintf(buf, "Not affected\n");
+}
+
static DEVICE_ATTR(meltdown, 0444, cpu_show_meltdown, NULL);
static DEVICE_ATTR(spectre_v1, 0444, cpu_show_spectre_v1, NULL);
static DEVICE_ATTR(spectre_v2, 0444, cpu_show_spectre_v2, NULL);
static DEVICE_ATTR(spec_store_bypass, 0444, cpu_show_spec_store_bypass, NULL);
+static DEVICE_ATTR(l1tf, 0444, cpu_show_l1tf, NULL);
static struct attribute *cpu_root_vulnerabilities_attrs[] = {
&dev_attr_meltdown.attr,
&dev_attr_spectre_v1.attr,
&dev_attr_spectre_v2.attr,
&dev_attr_spec_store_bypass.attr,
+ &dev_attr_l1tf.attr,
NULL
};
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 0f3fadd71230..9b0b69598e23 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -280,7 +280,8 @@ static void reset_bdev(struct zram *zram)
zram->backing_dev = NULL;
zram->old_block_size = 0;
zram->bdev = NULL;
-
+ zram->disk->queue->backing_dev_info->capabilities |=
+ BDI_CAP_SYNCHRONOUS_IO;
kvfree(zram->bitmap);
zram->bitmap = NULL;
}
@@ -382,6 +383,18 @@ static ssize_t backing_dev_store(struct device *dev,
zram->backing_dev = backing_dev;
zram->bitmap = bitmap;
zram->nr_pages = nr_pages;
+ /*
+ * With writeback feature, zram does asynchronous IO so it's no longer
+ * synchronous device so let's remove synchronous io flag. Othewise,
+ * upper layer(e.g., swap) could wait IO completion rather than
+ * (submit and return), which will cause system sluggish.
+ * Furthermore, when the IO function returns(e.g., swap_readpage),
+ * upper layer expects IO was done so it could deallocate the page
+ * freely but in fact, IO is going on so finally could cause
+ * use-after-free when the IO is really done.
+ */
+ zram->disk->queue->backing_dev_info->capabilities &=
+ ~BDI_CAP_SYNCHRONOUS_IO;
up_write(&zram->init_lock);
pr_info("setup backing device %s\n", file_name);
diff --git a/drivers/crypto/ccp/psp-dev.c b/drivers/crypto/ccp/psp-dev.c
index d95ec526587a..2867612eaf61 100644
--- a/drivers/crypto/ccp/psp-dev.c
+++ b/drivers/crypto/ccp/psp-dev.c
@@ -78,8 +78,6 @@ done:
static void sev_wait_cmd_ioc(struct psp_device *psp, unsigned int *reg)
{
- psp->sev_int_rcvd = 0;
-
wait_event(psp->sev_int_queue, psp->sev_int_rcvd);
*reg = ioread32(psp->io_regs + PSP_CMDRESP);
}
@@ -140,6 +138,8 @@ static int __sev_do_cmd_locked(int cmd, void *data, int *psp_ret)
iowrite32(phys_lsb, psp->io_regs + PSP_CMDBUFF_ADDR_LO);
iowrite32(phys_msb, psp->io_regs + PSP_CMDBUFF_ADDR_HI);
+ psp->sev_int_rcvd = 0;
+
reg = cmd;
reg <<= PSP_CMDRESP_CMD_SHIFT;
reg |= PSP_CMDRESP_IOC;
@@ -732,6 +732,9 @@ void psp_dev_destroy(struct sp_device *sp)
{
struct psp_device *psp = sp->psp_data;
+ if (!psp)
+ return;
+
if (psp->sev_misc)
kref_put(&misc_dev->refcount, sev_exit);
diff --git a/drivers/crypto/ccree/cc_cipher.c b/drivers/crypto/ccree/cc_cipher.c
index df98f7afe645..86e568a6d217 100644
--- a/drivers/crypto/ccree/cc_cipher.c
+++ b/drivers/crypto/ccree/cc_cipher.c
@@ -554,34 +554,82 @@ static void cc_setup_cipher_data(struct crypto_tfm *tfm,
}
}
+/*
+ * Update a CTR-AES 128 bit counter
+ */
+static void cc_update_ctr(u8 *ctr, unsigned int increment)
+{
+ if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ||
+ IS_ALIGNED((unsigned long)ctr, 8)) {
+
+ __be64 *high_be = (__be64 *)ctr;
+ __be64 *low_be = high_be + 1;
+ u64 orig_low = __be64_to_cpu(*low_be);
+ u64 new_low = orig_low + (u64)increment;
+
+ *low_be = __cpu_to_be64(new_low);
+
+ if (new_low < orig_low)
+ *high_be = __cpu_to_be64(__be64_to_cpu(*high_be) + 1);
+ } else {
+ u8 *pos = (ctr + AES_BLOCK_SIZE);
+ u8 val;
+ unsigned int size;
+
+ for (; increment; increment--)
+ for (size = AES_BLOCK_SIZE; size; size--) {
+ val = *--pos + 1;
+ *pos = val;
+ if (val)
+ break;
+ }
+ }
+}
+
static void cc_cipher_complete(struct device *dev, void *cc_req, int err)
{
struct skcipher_request *req = (struct skcipher_request *)cc_req;
struct scatterlist *dst = req->dst;
struct scatterlist *src = req->src;
struct cipher_req_ctx *req_ctx = skcipher_request_ctx(req);
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- unsigned int ivsize = crypto_skcipher_ivsize(tfm);
+ struct crypto_skcipher *sk_tfm = crypto_skcipher_reqtfm(req);
+ struct crypto_tfm *tfm = crypto_skcipher_tfm(sk_tfm);
+ struct cc_cipher_ctx *ctx_p = crypto_tfm_ctx(tfm);
+ unsigned int ivsize = crypto_skcipher_ivsize(sk_tfm);
+ unsigned int len;
- cc_unmap_cipher_request(dev, req_ctx, ivsize, src, dst);
- kzfree(req_ctx->iv);
+ switch (ctx_p->cipher_mode) {
+ case DRV_CIPHER_CBC:
+ /*
+ * The crypto API expects us to set the req->iv to the last
+ * ciphertext block. For encrypt, simply copy from the result.
+ * For decrypt, we must copy from a saved buffer since this
+ * could be an in-place decryption operation and the src is
+ * lost by this point.
+ */
+ if (req_ctx->gen_ctx.op_type == DRV_CRYPTO_DIRECTION_DECRYPT) {
+ memcpy(req->iv, req_ctx->backup_info, ivsize);
+ kzfree(req_ctx->backup_info);
+ } else if (!err) {
+ len = req->cryptlen - ivsize;
+ scatterwalk_map_and_copy(req->iv, req->dst, len,
+ ivsize, 0);
+ }
+ break;
- /*
- * The crypto API expects us to set the req->iv to the last
- * ciphertext block. For encrypt, simply copy from the result.
- * For decrypt, we must copy from a saved buffer since this
- * could be an in-place decryption operation and the src is
- * lost by this point.
- */
- if (req_ctx->gen_ctx.op_type == DRV_CRYPTO_DIRECTION_DECRYPT) {
- memcpy(req->iv, req_ctx->backup_info, ivsize);
- kzfree(req_ctx->backup_info);
- } else if (!err) {
- scatterwalk_map_and_copy(req->iv, req->dst,
- (req->cryptlen - ivsize),
- ivsize, 0);
+ case DRV_CIPHER_CTR:
+ /* Compute the counter of the last block */
+ len = ALIGN(req->cryptlen, AES_BLOCK_SIZE) / AES_BLOCK_SIZE;
+ cc_update_ctr((u8 *)req->iv, len);
+ break;
+
+ default:
+ break;
}
+ cc_unmap_cipher_request(dev, req_ctx, ivsize, src, dst);
+ kzfree(req_ctx->iv);
+
skcipher_request_complete(req, err);
}
@@ -713,20 +761,29 @@ static int cc_cipher_encrypt(struct skcipher_request *req)
static int cc_cipher_decrypt(struct skcipher_request *req)
{
struct crypto_skcipher *sk_tfm = crypto_skcipher_reqtfm(req);
+ struct crypto_tfm *tfm = crypto_skcipher_tfm(sk_tfm);
+ struct cc_cipher_ctx *ctx_p = crypto_tfm_ctx(tfm);
struct cipher_req_ctx *req_ctx = skcipher_request_ctx(req);
unsigned int ivsize = crypto_skcipher_ivsize(sk_tfm);
gfp_t flags = cc_gfp_flags(&req->base);
+ unsigned int len;
- /*
- * Allocate and save the last IV sized bytes of the source, which will
- * be lost in case of in-place decryption and might be needed for CTS.
- */
- req_ctx->backup_info = kmalloc(ivsize, flags);
- if (!req_ctx->backup_info)
- return -ENOMEM;
+ if (ctx_p->cipher_mode == DRV_CIPHER_CBC) {
+
+ /* Allocate and save the last IV sized bytes of the source,
+ * which will be lost in case of in-place decryption.
+ */
+ req_ctx->backup_info = kzalloc(ivsize, flags);
+ if (!req_ctx->backup_info)
+ return -ENOMEM;
+
+ len = req->cryptlen - ivsize;
+ scatterwalk_map_and_copy(req_ctx->backup_info, req->src, len,
+ ivsize, 0);
+ } else {
+ req_ctx->backup_info = NULL;
+ }
- scatterwalk_map_and_copy(req_ctx->backup_info, req->src,
- (req->cryptlen - ivsize), ivsize, 0);
req_ctx->is_giv = false;
return cc_cipher_process(req, DRV_CRYPTO_DIRECTION_DECRYPT);
diff --git a/drivers/crypto/ccree/cc_hash.c b/drivers/crypto/ccree/cc_hash.c
index 96ff777474d7..e4ebde05a8a0 100644
--- a/drivers/crypto/ccree/cc_hash.c
+++ b/drivers/crypto/ccree/cc_hash.c
@@ -602,66 +602,7 @@ static int cc_hash_update(struct ahash_request *req)
return rc;
}
-static int cc_hash_finup(struct ahash_request *req)
-{
- struct ahash_req_ctx *state = ahash_request_ctx(req);
- struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
- struct cc_hash_ctx *ctx = crypto_ahash_ctx(tfm);
- u32 digestsize = crypto_ahash_digestsize(tfm);
- struct scatterlist *src = req->src;
- unsigned int nbytes = req->nbytes;
- u8 *result = req->result;
- struct device *dev = drvdata_to_dev(ctx->drvdata);
- bool is_hmac = ctx->is_hmac;
- struct cc_crypto_req cc_req = {};
- struct cc_hw_desc desc[CC_MAX_HASH_SEQ_LEN];
- unsigned int idx = 0;
- int rc;
- gfp_t flags = cc_gfp_flags(&req->base);
-
- dev_dbg(dev, "===== %s-finup (%d) ====\n", is_hmac ? "hmac" : "hash",
- nbytes);
-
- if (cc_map_req(dev, state, ctx)) {
- dev_err(dev, "map_ahash_source() failed\n");
- return -EINVAL;
- }
-
- if (cc_map_hash_request_final(ctx->drvdata, state, src, nbytes, 1,
- flags)) {
- dev_err(dev, "map_ahash_request_final() failed\n");
- cc_unmap_req(dev, state, ctx);
- return -ENOMEM;
- }
- if (cc_map_result(dev, state, digestsize)) {
- dev_err(dev, "map_ahash_digest() failed\n");
- cc_unmap_hash_request(dev, state, src, true);
- cc_unmap_req(dev, state, ctx);
- return -ENOMEM;
- }
-
- /* Setup request structure */
- cc_req.user_cb = cc_hash_complete;
- cc_req.user_arg = req;
-
- idx = cc_restore_hash(desc, ctx, state, idx);
-
- if (is_hmac)
- idx = cc_fin_hmac(desc, req, idx);
-
- idx = cc_fin_result(desc, req, idx);
-
- rc = cc_send_request(ctx->drvdata, &cc_req, desc, idx, &req->base);
- if (rc != -EINPROGRESS && rc != -EBUSY) {
- dev_err(dev, "send_request() failed (rc=%d)\n", rc);
- cc_unmap_hash_request(dev, state, src, true);
- cc_unmap_result(dev, state, digestsize, result);
- cc_unmap_req(dev, state, ctx);
- }
- return rc;
-}
-
-static int cc_hash_final(struct ahash_request *req)
+static int cc_do_finup(struct ahash_request *req, bool update)
{
struct ahash_req_ctx *state = ahash_request_ctx(req);
struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
@@ -678,21 +619,20 @@ static int cc_hash_final(struct ahash_request *req)
int rc;
gfp_t flags = cc_gfp_flags(&req->base);
- dev_dbg(dev, "===== %s-final (%d) ====\n", is_hmac ? "hmac" : "hash",
- nbytes);
+ dev_dbg(dev, "===== %s-%s (%d) ====\n", is_hmac ? "hmac" : "hash",
+ update ? "finup" : "final", nbytes);
if (cc_map_req(dev, state, ctx)) {
dev_err(dev, "map_ahash_source() failed\n");
return -EINVAL;
}
- if (cc_map_hash_request_final(ctx->drvdata, state, src, nbytes, 0,
+ if (cc_map_hash_request_final(ctx->drvdata, state, src, nbytes, update,
flags)) {
dev_err(dev, "map_ahash_request_final() failed\n");
cc_unmap_req(dev, state, ctx);
return -ENOMEM;
}
-
if (cc_map_result(dev, state, digestsize)) {
dev_err(dev, "map_ahash_digest() failed\n");
cc_unmap_hash_request(dev, state, src, true);
@@ -706,7 +646,7 @@ static int cc_hash_final(struct ahash_request *req)
idx = cc_restore_hash(desc, ctx, state, idx);
- /* "DO-PAD" must be enabled only when writing current length to HW */
+ /* Pad the hash */
hw_desc_init(&desc[idx]);
set_cipher_do(&desc[idx], DO_PAD);
set_cipher_mode(&desc[idx], ctx->hw_mode);
@@ -731,6 +671,17 @@ static int cc_hash_final(struct ahash_request *req)
return rc;
}
+static int cc_hash_finup(struct ahash_request *req)
+{
+ return cc_do_finup(req, true);
+}
+
+
+static int cc_hash_final(struct ahash_request *req)
+{
+ return cc_do_finup(req, false);
+}
+
static int cc_hash_init(struct ahash_request *req)
{
struct ahash_req_ctx *state = ahash_request_ctx(req);
diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
index f0519e31543a..fbe2a9bee07f 100644
--- a/drivers/gpu/drm/i915/i915_pmu.c
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -24,6 +24,7 @@
#include <linux/perf_event.h>
#include <linux/pm_runtime.h>
+#include <linux/irq.h>
#include "i915_drv.h"
#include "i915_pmu.h"
diff --git a/drivers/gpu/drm/i915/intel_lpe_audio.c b/drivers/gpu/drm/i915/intel_lpe_audio.c
index 6269750e2b54..b4941101f21a 100644
--- a/drivers/gpu/drm/i915/intel_lpe_audio.c
+++ b/drivers/gpu/drm/i915/intel_lpe_audio.c
@@ -62,6 +62,7 @@
#include <linux/acpi.h>
#include <linux/device.h>
+#include <linux/irq.h>
#include <linux/pci.h>
#include <linux/pm_runtime.h>
diff --git a/drivers/isdn/i4l/isdn_common.c b/drivers/isdn/i4l/isdn_common.c
index 7c6f3f5d9d9a..66ac7fa6e034 100644
--- a/drivers/isdn/i4l/isdn_common.c
+++ b/drivers/isdn/i4l/isdn_common.c
@@ -1640,13 +1640,7 @@ isdn_ioctl(struct file *file, uint cmd, ulong arg)
} else
return -EINVAL;
case IIOCDBGVAR:
- if (arg) {
- if (copy_to_user(argp, &dev, sizeof(ulong)))
- return -EFAULT;
- return 0;
- } else
- return -EINVAL;
- break;
+ return -EINVAL;
default:
if ((cmd & IIOCDRVCTL) == IIOCDRVCTL)
cmd = ((cmd >> _IOC_NRSHIFT) & _IOC_NRMASK) & ISDN_DRVIOCTL_MASK;
diff --git a/drivers/misc/sram.c b/drivers/misc/sram.c
index fc0415771c00..4dd0d868ff88 100644
--- a/drivers/misc/sram.c
+++ b/drivers/misc/sram.c
@@ -407,13 +407,20 @@ static int sram_probe(struct platform_device *pdev)
if (init_func) {
ret = init_func();
if (ret)
- return ret;
+ goto err_disable_clk;
}
dev_dbg(sram->dev, "SRAM pool: %zu KiB @ 0x%p\n",
gen_pool_size(sram->pool) / 1024, sram->virt_base);
return 0;
+
+err_disable_clk:
+ if (sram->clk)
+ clk_disable_unprepare(sram->clk);
+ sram_free_partitions(sram);
+
+ return ret;
}
static int sram_remove(struct platform_device *pdev)
diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c
index 956860a69797..3bdab972420b 100644
--- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c
+++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c
@@ -762,7 +762,7 @@ static int hw_atl_b0_hw_packet_filter_set(struct aq_hw_s *self,
hw_atl_rpfl2promiscuous_mode_en_set(self, IS_FILTER_ENABLED(IFF_PROMISC));
hw_atl_rpfl2multicast_flr_en_set(self,
- IS_FILTER_ENABLED(IFF_MULTICAST), 0);
+ IS_FILTER_ENABLED(IFF_ALLMULTI), 0);
hw_atl_rpfl2_accept_all_mc_packets_set(self,
IS_FILTER_ENABLED(IFF_ALLMULTI));
diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c
index 0ad2f3f7da85..82ac1d10f239 100644
--- a/drivers/net/ethernet/marvell/mvneta.c
+++ b/drivers/net/ethernet/marvell/mvneta.c
@@ -1901,10 +1901,10 @@ static void mvneta_rxq_drop_pkts(struct mvneta_port *pp,
}
/* Main rx processing when using software buffer management */
-static int mvneta_rx_swbm(struct mvneta_port *pp, int rx_todo,
+static int mvneta_rx_swbm(struct napi_struct *napi,
+ struct mvneta_port *pp, int rx_todo,
struct mvneta_rx_queue *rxq)
{
- struct mvneta_pcpu_port *port = this_cpu_ptr(pp->ports);
struct net_device *dev = pp->dev;
int rx_done;
u32 rcvd_pkts = 0;
@@ -1959,7 +1959,7 @@ err_drop_frame:
skb->protocol = eth_type_trans(skb, dev);
mvneta_rx_csum(pp, rx_status, skb);
- napi_gro_receive(&port->napi, skb);
+ napi_gro_receive(napi, skb);
rcvd_pkts++;
rcvd_bytes += rx_bytes;
@@ -2001,7 +2001,7 @@ err_drop_frame:
mvneta_rx_csum(pp, rx_status, skb);
- napi_gro_receive(&port->napi, skb);
+ napi_gro_receive(napi, skb);
}
if (rcvd_pkts) {
@@ -2020,10 +2020,10 @@ err_drop_frame:
}
/* Main rx processing when using hardware buffer management */
-static int mvneta_rx_hwbm(struct mvneta_port *pp, int rx_todo,
+static int mvneta_rx_hwbm(struct napi_struct *napi,
+ struct mvneta_port *pp, int rx_todo,
struct mvneta_rx_queue *rxq)
{
- struct mvneta_pcpu_port *port = this_cpu_ptr(pp->ports);
struct net_device *dev = pp->dev;
int rx_done;
u32 rcvd_pkts = 0;
@@ -2085,7 +2085,7 @@ err_drop_frame:
skb->protocol = eth_type_trans(skb, dev);
mvneta_rx_csum(pp, rx_status, skb);
- napi_gro_receive(&port->napi, skb);
+ napi_gro_receive(napi, skb);
rcvd_pkts++;
rcvd_bytes += rx_bytes;
@@ -2129,7 +2129,7 @@ err_drop_frame:
mvneta_rx_csum(pp, rx_status, skb);
- napi_gro_receive(&port->napi, skb);
+ napi_gro_receive(napi, skb);
}
if (rcvd_pkts) {
@@ -2722,9 +2722,11 @@ static int mvneta_poll(struct napi_struct *napi, int budget)
if (rx_queue) {
rx_queue = rx_queue - 1;
if (pp->bm_priv)
- rx_done = mvneta_rx_hwbm(pp, budget, &pp->rxqs[rx_queue]);
+ rx_done = mvneta_rx_hwbm(napi, pp, budget,
+ &pp->rxqs[rx_queue]);
else
- rx_done = mvneta_rx_swbm(pp, budget, &pp->rxqs[rx_queue]);
+ rx_done = mvneta_rx_swbm(napi, pp, budget,
+ &pp->rxqs[rx_queue]);
}
if (rx_done < budget) {
@@ -4018,13 +4020,18 @@ static int mvneta_config_rss(struct mvneta_port *pp)
on_each_cpu(mvneta_percpu_mask_interrupt, pp, true);
- /* We have to synchronise on the napi of each CPU */
- for_each_online_cpu(cpu) {
- struct mvneta_pcpu_port *pcpu_port =
- per_cpu_ptr(pp->ports, cpu);
+ if (!pp->neta_armada3700) {
+ /* We have to synchronise on the napi of each CPU */
+ for_each_online_cpu(cpu) {
+ struct mvneta_pcpu_port *pcpu_port =
+ per_cpu_ptr(pp->ports, cpu);
- napi_synchronize(&pcpu_port->napi);
- napi_disable(&pcpu_port->napi);
+ napi_synchronize(&pcpu_port->napi);
+ napi_disable(&pcpu_port->napi);
+ }
+ } else {
+ napi_synchronize(&pp->napi);
+ napi_disable(&pp->napi);
}
pp->rxq_def = pp->indir[0];
@@ -4041,12 +4048,16 @@ static int mvneta_config_rss(struct mvneta_port *pp)
mvneta_percpu_elect(pp);
spin_unlock(&pp->lock);
- /* We have to synchronise on the napi of each CPU */
- for_each_online_cpu(cpu) {
- struct mvneta_pcpu_port *pcpu_port =
- per_cpu_ptr(pp->ports, cpu);
+ if (!pp->neta_armada3700) {
+ /* We have to synchronise on the napi of each CPU */
+ for_each_online_cpu(cpu) {
+ struct mvneta_pcpu_port *pcpu_port =
+ per_cpu_ptr(pp->ports, cpu);
- napi_enable(&pcpu_port->napi);
+ napi_enable(&pcpu_port->napi);
+ }
+ } else {
+ napi_enable(&pp->napi);
}
netif_tx_start_all_queues(pp->dev);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index a0ba6cfc9092..290fc6f9afc1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -1907,15 +1907,15 @@ static bool actions_match_supported(struct mlx5e_priv *priv,
static bool same_hw_devs(struct mlx5e_priv *priv, struct mlx5e_priv *peer_priv)
{
struct mlx5_core_dev *fmdev, *pmdev;
- u16 func_id, peer_id;
+ u64 fsystem_guid, psystem_guid;
fmdev = priv->mdev;
pmdev = peer_priv->mdev;
- func_id = (u16)((fmdev->pdev->bus->number << 8) | PCI_SLOT(fmdev->pdev->devfn));
- peer_id = (u16)((pmdev->pdev->bus->number << 8) | PCI_SLOT(pmdev->pdev->devfn));
+ mlx5_query_nic_vport_system_image_guid(fmdev, &fsystem_guid);
+ mlx5_query_nic_vport_system_image_guid(pmdev, &psystem_guid);
- return (func_id == peer_id);
+ return (fsystem_guid == psystem_guid);
}
static int parse_tc_nic_actions(struct mlx5e_priv *priv, struct tcf_exts *exts,
diff --git a/drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_actions.c b/drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_actions.c
index 3c0d882ba183..f6f6a568d66a 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_actions.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_actions.c
@@ -327,12 +327,16 @@ static void mlxsw_afa_resource_add(struct mlxsw_afa_block *block,
list_add(&resource->list, &block->resource_list);
}
+static void mlxsw_afa_resource_del(struct mlxsw_afa_resource *resource)
+{
+ list_del(&resource->list);
+}
+
static void mlxsw_afa_resources_destroy(struct mlxsw_afa_block *block)
{
struct mlxsw_afa_resource *resource, *tmp;
list_for_each_entry_safe(resource, tmp, &block->resource_list, list) {
- list_del(&resource->list);
resource->destructor(block, resource);
}
}
@@ -530,6 +534,7 @@ static void
mlxsw_afa_fwd_entry_ref_destroy(struct mlxsw_afa_block *block,
struct mlxsw_afa_fwd_entry_ref *fwd_entry_ref)
{
+ mlxsw_afa_resource_del(&fwd_entry_ref->resource);
mlxsw_afa_fwd_entry_put(block->afa, fwd_entry_ref->fwd_entry);
kfree(fwd_entry_ref);
}
@@ -579,6 +584,7 @@ static void
mlxsw_afa_counter_destroy(struct mlxsw_afa_block *block,
struct mlxsw_afa_counter *counter)
{
+ mlxsw_afa_resource_del(&counter->resource);
block->afa->ops->counter_index_put(block->afa->ops_priv,
counter->counter_index);
kfree(counter);
@@ -626,8 +632,8 @@ static char *mlxsw_afa_block_append_action(struct mlxsw_afa_block *block,
char *oneact;
char *actions;
- if (WARN_ON(block->finished))
- return NULL;
+ if (block->finished)
+ return ERR_PTR(-EINVAL);
if (block->cur_act_index + action_size >
block->afa->max_acts_per_set) {
struct mlxsw_afa_set *set;
@@ -637,7 +643,7 @@ static char *mlxsw_afa_block_append_action(struct mlxsw_afa_block *block,
*/
set = mlxsw_afa_set_create(false);
if (!set)
- return NULL;
+ return ERR_PTR(-ENOBUFS);
set->prev = block->cur_set;
block->cur_act_index = 0;
block->cur_set->next = set;
@@ -724,8 +730,8 @@ int mlxsw_afa_block_append_vlan_modify(struct mlxsw_afa_block *block,
MLXSW_AFA_VLAN_CODE,
MLXSW_AFA_VLAN_SIZE);
- if (!act)
- return -ENOBUFS;
+ if (IS_ERR(act))
+ return PTR_ERR(act);
mlxsw_afa_vlan_pack(act, MLXSW_AFA_VLAN_VLAN_TAG_CMD_NOP,
MLXSW_AFA_VLAN_CMD_SET_OUTER, vid,
MLXSW_AFA_VLAN_CMD_SET_OUTER, pcp,
@@ -806,8 +812,8 @@ int mlxsw_afa_block_append_drop(struct mlxsw_afa_block *block)
MLXSW_AFA_TRAPDISC_CODE,
MLXSW_AFA_TRAPDISC_SIZE);
- if (!act)
- return -ENOBUFS;
+ if (IS_ERR(act))
+ return PTR_ERR(act);
mlxsw_afa_trapdisc_pack(act, MLXSW_AFA_TRAPDISC_TRAP_ACTION_NOP,
MLXSW_AFA_TRAPDISC_FORWARD_ACTION_DISCARD, 0);
return 0;
@@ -820,8 +826,8 @@ int mlxsw_afa_block_append_trap(struct mlxsw_afa_block *block, u16 trap_id)
MLXSW_AFA_TRAPDISC_CODE,
MLXSW_AFA_TRAPDISC_SIZE);
- if (!act)
- return -ENOBUFS;
+ if (IS_ERR(act))
+ return PTR_ERR(act);
mlxsw_afa_trapdisc_pack(act, MLXSW_AFA_TRAPDISC_TRAP_ACTION_TRAP,
MLXSW_AFA_TRAPDISC_FORWARD_ACTION_DISCARD,
trap_id);
@@ -836,8 +842,8 @@ int mlxsw_afa_block_append_trap_and_forward(struct mlxsw_afa_block *block,
MLXSW_AFA_TRAPDISC_CODE,
MLXSW_AFA_TRAPDISC_SIZE);
- if (!act)
- return -ENOBUFS;
+ if (IS_ERR(act))
+ return PTR_ERR(act);
mlxsw_afa_trapdisc_pack(act, MLXSW_AFA_TRAPDISC_TRAP_ACTION_TRAP,
MLXSW_AFA_TRAPDISC_FORWARD_ACTION_FORWARD,
trap_id);
@@ -856,6 +862,7 @@ static void
mlxsw_afa_mirror_destroy(struct mlxsw_afa_block *block,
struct mlxsw_afa_mirror *mirror)
{
+ mlxsw_afa_resource_del(&mirror->resource);
block->afa->ops->mirror_del(block->afa->ops_priv,
mirror->local_in_port,
mirror->span_id,
@@ -908,8 +915,8 @@ mlxsw_afa_block_append_allocated_mirror(struct mlxsw_afa_block *block,
char *act = mlxsw_afa_block_append_action(block,
MLXSW_AFA_TRAPDISC_CODE,
MLXSW_AFA_TRAPDISC_SIZE);
- if (!act)
- return -ENOBUFS;
+ if (IS_ERR(act))
+ return PTR_ERR(act);
mlxsw_afa_trapdisc_pack(act, MLXSW_AFA_TRAPDISC_TRAP_ACTION_NOP,
MLXSW_AFA_TRAPDISC_FORWARD_ACTION_FORWARD, 0);
mlxsw_afa_trapdisc_mirror_pack(act, true, mirror_agent);
@@ -996,8 +1003,8 @@ int mlxsw_afa_block_append_fwd(struct mlxsw_afa_block *block,
act = mlxsw_afa_block_append_action(block, MLXSW_AFA_FORWARD_CODE,
MLXSW_AFA_FORWARD_SIZE);
- if (!act) {
- err = -ENOBUFS;
+ if (IS_ERR(act)) {
+ err = PTR_ERR(act);
goto err_append_action;
}
mlxsw_afa_forward_pack(act, MLXSW_AFA_FORWARD_TYPE_PBS,
@@ -1052,8 +1059,8 @@ int mlxsw_afa_block_append_allocated_counter(struct mlxsw_afa_block *block,
{
char *act = mlxsw_afa_block_append_action(block, MLXSW_AFA_POLCNT_CODE,
MLXSW_AFA_POLCNT_SIZE);
- if (!act)
- return -ENOBUFS;
+ if (IS_ERR(act))
+ return PTR_ERR(act);
mlxsw_afa_polcnt_pack(act, MLXSW_AFA_POLCNT_COUNTER_SET_TYPE_PACKETS_BYTES,
counter_index);
return 0;
@@ -1123,8 +1130,8 @@ int mlxsw_afa_block_append_fid_set(struct mlxsw_afa_block *block, u16 fid)
char *act = mlxsw_afa_block_append_action(block,
MLXSW_AFA_VIRFWD_CODE,
MLXSW_AFA_VIRFWD_SIZE);
- if (!act)
- return -ENOBUFS;
+ if (IS_ERR(act))
+ return PTR_ERR(act);
mlxsw_afa_virfwd_pack(act, MLXSW_AFA_VIRFWD_FID_CMD_SET, fid);
return 0;
}
@@ -1193,8 +1200,8 @@ int mlxsw_afa_block_append_mcrouter(struct mlxsw_afa_block *block,
char *act = mlxsw_afa_block_append_action(block,
MLXSW_AFA_MCROUTER_CODE,
MLXSW_AFA_MCROUTER_SIZE);
- if (!act)
- return -ENOBUFS;
+ if (IS_ERR(act))
+ return PTR_ERR(act);
mlxsw_afa_mcrouter_pack(act, MLXSW_AFA_MCROUTER_RPF_ACTION_TRAP,
expected_irif, min_mtu, rmid_valid, kvdl_index);
return 0;
diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index 764b25fa470c..d3c6ce074571 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -8061,12 +8061,20 @@ static int rtl_alloc_irq(struct rtl8169_private *tp)
{
unsigned int flags;
- if (tp->mac_version <= RTL_GIGA_MAC_VER_06) {
+ switch (tp->mac_version) {
+ case RTL_GIGA_MAC_VER_01 ... RTL_GIGA_MAC_VER_06:
RTL_W8(tp, Cfg9346, Cfg9346_Unlock);
RTL_W8(tp, Config2, RTL_R8(tp, Config2) & ~MSIEnable);
RTL_W8(tp, Cfg9346, Cfg9346_Lock);
flags = PCI_IRQ_LEGACY;
- } else {
+ break;
+ case RTL_GIGA_MAC_VER_39 ... RTL_GIGA_MAC_VER_40:
+ /* This version was reported to have issues with resume
+ * from suspend when using MSI-X
+ */
+ flags = PCI_IRQ_LEGACY | PCI_IRQ_MSI;
+ break;
+ default:
flags = PCI_IRQ_ALL_TYPES;
}
diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
index 42e93cb4eca7..ea2da91a96ea 100644
--- a/drivers/net/xen-netfront.c
+++ b/drivers/net/xen-netfront.c
@@ -894,7 +894,6 @@ static RING_IDX xennet_fill_frags(struct netfront_queue *queue,
struct sk_buff *skb,
struct sk_buff_head *list)
{
- struct skb_shared_info *shinfo = skb_shinfo(skb);
RING_IDX cons = queue->rx.rsp_cons;
struct sk_buff *nskb;
@@ -903,15 +902,16 @@ static RING_IDX xennet_fill_frags(struct netfront_queue *queue,
RING_GET_RESPONSE(&queue->rx, ++cons);
skb_frag_t *nfrag = &skb_shinfo(nskb)->frags[0];
- if (shinfo->nr_frags == MAX_SKB_FRAGS) {
+ if (skb_shinfo(skb)->nr_frags == MAX_SKB_FRAGS) {
unsigned int pull_to = NETFRONT_SKB_CB(skb)->pull_to;
BUG_ON(pull_to <= skb_headlen(skb));
__pskb_pull_tail(skb, pull_to - skb_headlen(skb));
}
- BUG_ON(shinfo->nr_frags >= MAX_SKB_FRAGS);
+ BUG_ON(skb_shinfo(skb)->nr_frags >= MAX_SKB_FRAGS);
- skb_add_rx_frag(skb, shinfo->nr_frags, skb_frag_page(nfrag),
+ skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags,
+ skb_frag_page(nfrag),
rx->offset, rx->status, PAGE_SIZE);
skb_shinfo(nskb)->nr_frags = 0;
diff --git a/drivers/pci/host/pci-hyperv.c b/drivers/pci/host/pci-hyperv.c
index 4690814cfc51..c07e952b94ee 100644
--- a/drivers/pci/host/pci-hyperv.c
+++ b/drivers/pci/host/pci-hyperv.c
@@ -43,6 +43,8 @@
#include <linux/delay.h>
#include <linux/semaphore.h>
#include <linux/irqdomain.h>
+#include <linux/irq.h>
+
#include <asm/irqdomain.h>
#include <asm/apic.h>
#include <linux/msi.h>
diff --git a/drivers/scsi/qla2xxx/qla_iocb.c b/drivers/scsi/qla2xxx/qla_iocb.c
index a91cca52b5d5..dd93a22fe843 100644
--- a/drivers/scsi/qla2xxx/qla_iocb.c
+++ b/drivers/scsi/qla2xxx/qla_iocb.c
@@ -2130,34 +2130,11 @@ __qla2x00_alloc_iocbs(struct qla_qpair *qpair, srb_t *sp)
req_cnt = 1;
handle = 0;
- if (!sp)
- goto skip_cmd_array;
-
- /* Check for room in outstanding command list. */
- handle = req->current_outstanding_cmd;
- for (index = 1; index < req->num_outstanding_cmds; index++) {
- handle++;
- if (handle == req->num_outstanding_cmds)
- handle = 1;
- if (!req->outstanding_cmds[handle])
- break;
- }
- if (index == req->num_outstanding_cmds) {
- ql_log(ql_log_warn, vha, 0x700b,
- "No room on outstanding cmd array.\n");
- goto queuing_error;
- }
-
- /* Prep command array. */
- req->current_outstanding_cmd = handle;
- req->outstanding_cmds[handle] = sp;
- sp->handle = handle;
-
- /* Adjust entry-counts as needed. */
- if (sp->type != SRB_SCSI_CMD)
+ if (sp && (sp->type != SRB_SCSI_CMD)) {
+ /* Adjust entry-counts as needed. */
req_cnt = sp->iocbs;
+ }
-skip_cmd_array:
/* Check for room on request queue. */
if (req->cnt < req_cnt + 2) {
if (qpair->use_shadow_reg)
@@ -2183,6 +2160,28 @@ skip_cmd_array:
if (req->cnt < req_cnt + 2)
goto queuing_error;
+ if (sp) {
+ /* Check for room in outstanding command list. */
+ handle = req->current_outstanding_cmd;
+ for (index = 1; index < req->num_outstanding_cmds; index++) {
+ handle++;
+ if (handle == req->num_outstanding_cmds)
+ handle = 1;
+ if (!req->outstanding_cmds[handle])
+ break;
+ }
+ if (index == req->num_outstanding_cmds) {
+ ql_log(ql_log_warn, vha, 0x700b,
+ "No room on outstanding cmd array.\n");
+ goto queuing_error;
+ }
+
+ /* Prep command array. */
+ req->current_outstanding_cmd = handle;
+ req->outstanding_cmds[handle] = sp;
+ sp->handle = handle;
+ }
+
/* Prep packet */
req->cnt -= req_cnt;
pkt = req->ring_ptr;
@@ -2195,6 +2194,8 @@ skip_cmd_array:
pkt->handle = handle;
}
+ return pkt;
+
queuing_error:
qpair->tgt_counters.num_alloc_iocb_failed++;
return pkt;
diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c
index 3f3cb72e0c0c..d0389b20574d 100644
--- a/drivers/scsi/sr.c
+++ b/drivers/scsi/sr.c
@@ -523,18 +523,26 @@ static int sr_init_command(struct scsi_cmnd *SCpnt)
static int sr_block_open(struct block_device *bdev, fmode_t mode)
{
struct scsi_cd *cd;
+ struct scsi_device *sdev;
int ret = -ENXIO;
+ cd = scsi_cd_get(bdev->bd_disk);
+ if (!cd)
+ goto out;
+
+ sdev = cd->device;
+ scsi_autopm_get_device(sdev);
check_disk_change(bdev);
mutex_lock(&sr_mutex);
- cd = scsi_cd_get(bdev->bd_disk);
- if (cd) {
- ret = cdrom_open(&cd->cdi, bdev, mode);
- if (ret)
- scsi_cd_put(cd);
- }
+ ret = cdrom_open(&cd->cdi, bdev, mode);
mutex_unlock(&sr_mutex);
+
+ scsi_autopm_put_device(sdev);
+ if (ret)
+ scsi_cd_put(cd);
+
+out:
return ret;
}
@@ -562,6 +570,8 @@ static int sr_block_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
if (ret)
goto out;
+ scsi_autopm_get_device(sdev);
+
/*
* Send SCSI addressing ioctls directly to mid level, send other
* ioctls to cdrom/block level.
@@ -570,15 +580,18 @@ static int sr_block_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
case SCSI_IOCTL_GET_IDLUN:
case SCSI_IOCTL_GET_BUS_NUMBER:
ret = scsi_ioctl(sdev, cmd, argp);
- goto out;
+ goto put;
}
ret = cdrom_ioctl(&cd->cdi, bdev, mode, cmd, arg);
if (ret != -ENOSYS)
- goto out;
+ goto put;
ret = scsi_ioctl(sdev, cmd, argp);
+put:
+ scsi_autopm_put_device(sdev);
+
out:
mutex_unlock(&sr_mutex);
return ret;
diff --git a/drivers/tty/serial/8250/8250_dw.c b/drivers/tty/serial/8250/8250_dw.c
index 6fcdb90f616a..2826052a7e70 100644
--- a/drivers/tty/serial/8250/8250_dw.c
+++ b/drivers/tty/serial/8250/8250_dw.c
@@ -274,7 +274,7 @@ static void dw8250_set_termios(struct uart_port *p, struct ktermios *termios,
long rate;
int ret;
- if (IS_ERR(d->clk) || !old)
+ if (IS_ERR(d->clk))
goto out;
clk_disable_unprepare(d->clk);
@@ -680,6 +680,7 @@ static const struct acpi_device_id dw8250_acpi_match[] = {
{ "APMC0D08", 0},
{ "AMD0020", 0 },
{ "AMDI0020", 0 },
+ { "BRCM2032", 0 },
{ "HISI0031", 0 },
{ },
};
diff --git a/drivers/tty/serial/8250/8250_exar.c b/drivers/tty/serial/8250/8250_exar.c
index 38af306ca0e8..a951511f04cf 100644
--- a/drivers/tty/serial/8250/8250_exar.c
+++ b/drivers/tty/serial/8250/8250_exar.c
@@ -433,7 +433,11 @@ static irqreturn_t exar_misc_handler(int irq, void *data)
struct exar8250 *priv = data;
/* Clear all PCI interrupts by reading INT0. No effect on IIR */
- ioread8(priv->virt + UART_EXAR_INT0);
+ readb(priv->virt + UART_EXAR_INT0);
+
+ /* Clear INT0 for Expansion Interface slave ports, too */
+ if (priv->board->num_ports > 8)
+ readb(priv->virt + 0x2000 + UART_EXAR_INT0);
return IRQ_HANDLED;
}
diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c
index 8fbd5fbeb318..fdda268520d3 100644
--- a/drivers/tty/serial/8250/8250_port.c
+++ b/drivers/tty/serial/8250/8250_port.c
@@ -90,8 +90,7 @@ static const struct serial8250_config uart_config[] = {
.name = "16550A",
.fifo_size = 16,
.tx_loadsz = 16,
- .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10 |
- UART_FCR_CLEAR_RCVR | UART_FCR_CLEAR_XMIT,
+ .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10,
.rxtrig_bytes = {1, 4, 8, 14},
.flags = UART_CAP_FIFO,
},
diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c
index 2058852a87fa..2fca8060c90b 100644
--- a/drivers/usb/serial/option.c
+++ b/drivers/usb/serial/option.c
@@ -196,6 +196,8 @@ static void option_instat_callback(struct urb *urb);
#define DELL_PRODUCT_5800_V2_MINICARD_VZW 0x8196 /* Novatel E362 */
#define DELL_PRODUCT_5804_MINICARD_ATT 0x819b /* Novatel E371 */
+#define DELL_PRODUCT_5821E 0x81d7
+
#define KYOCERA_VENDOR_ID 0x0c88
#define KYOCERA_PRODUCT_KPC650 0x17da
#define KYOCERA_PRODUCT_KPC680 0x180a
@@ -1030,6 +1032,8 @@ static const struct usb_device_id option_ids[] = {
{ USB_DEVICE_AND_INTERFACE_INFO(DELL_VENDOR_ID, DELL_PRODUCT_5800_MINICARD_VZW, 0xff, 0xff, 0xff) },
{ USB_DEVICE_AND_INTERFACE_INFO(DELL_VENDOR_ID, DELL_PRODUCT_5800_V2_MINICARD_VZW, 0xff, 0xff, 0xff) },
{ USB_DEVICE_AND_INTERFACE_INFO(DELL_VENDOR_ID, DELL_PRODUCT_5804_MINICARD_ATT, 0xff, 0xff, 0xff) },
+ { USB_DEVICE(DELL_VENDOR_ID, DELL_PRODUCT_5821E),
+ .driver_info = RSVD(0) | RSVD(1) | RSVD(6) },
{ USB_DEVICE(ANYDATA_VENDOR_ID, ANYDATA_PRODUCT_ADU_E100A) }, /* ADU-E100, ADU-310 */
{ USB_DEVICE(ANYDATA_VENDOR_ID, ANYDATA_PRODUCT_ADU_500A) },
{ USB_DEVICE(ANYDATA_VENDOR_ID, ANYDATA_PRODUCT_ADU_620UW) },
diff --git a/drivers/usb/serial/pl2303.c b/drivers/usb/serial/pl2303.c
index 46dd09da2434..3bc2d6c28aa3 100644
--- a/drivers/usb/serial/pl2303.c
+++ b/drivers/usb/serial/pl2303.c
@@ -52,6 +52,8 @@ static const struct usb_device_id id_table[] = {
.driver_info = PL2303_QUIRK_ENDPOINT_HACK },
{ USB_DEVICE(ATEN_VENDOR_ID, ATEN_PRODUCT_UC485),
.driver_info = PL2303_QUIRK_ENDPOINT_HACK },
+ { USB_DEVICE(ATEN_VENDOR_ID, ATEN_PRODUCT_UC232B),
+ .driver_info = PL2303_QUIRK_ENDPOINT_HACK },
{ USB_DEVICE(ATEN_VENDOR_ID, ATEN_PRODUCT_ID2) },
{ USB_DEVICE(ATEN_VENDOR_ID2, ATEN_PRODUCT_ID) },
{ USB_DEVICE(ELCOM_VENDOR_ID, ELCOM_PRODUCT_ID) },
diff --git a/drivers/usb/serial/pl2303.h b/drivers/usb/serial/pl2303.h
index fcd72396a7b6..26965cc23c17 100644
--- a/drivers/usb/serial/pl2303.h
+++ b/drivers/usb/serial/pl2303.h
@@ -24,6 +24,7 @@
#define ATEN_VENDOR_ID2 0x0547
#define ATEN_PRODUCT_ID 0x2008
#define ATEN_PRODUCT_UC485 0x2021
+#define ATEN_PRODUCT_UC232B 0x2022
#define ATEN_PRODUCT_ID2 0x2118
#define IODATA_VENDOR_ID 0x04bb
diff --git a/drivers/usb/serial/sierra.c b/drivers/usb/serial/sierra.c
index d189f953c891..55956a638f5b 100644
--- a/drivers/usb/serial/sierra.c
+++ b/drivers/usb/serial/sierra.c
@@ -770,9 +770,9 @@ static void sierra_close(struct usb_serial_port *port)
kfree(urb->transfer_buffer);
usb_free_urb(urb);
usb_autopm_put_interface_async(serial->interface);
- spin_lock(&portdata->lock);
+ spin_lock_irq(&portdata->lock);
portdata->outstanding_urbs--;
- spin_unlock(&portdata->lock);
+ spin_unlock_irq(&portdata->lock);
}
sierra_stop_rx_urbs(port);
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 9beefa6ed1ce..d1de2cb13fd6 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -1556,9 +1556,12 @@ int vhost_init_device_iotlb(struct vhost_dev *d, bool enabled)
d->iotlb = niotlb;
for (i = 0; i < d->nvqs; ++i) {
- mutex_lock(&d->vqs[i]->mutex);
- d->vqs[i]->iotlb = niotlb;
- mutex_unlock(&d->vqs[i]->mutex);
+ struct vhost_virtqueue *vq = d->vqs[i];
+
+ mutex_lock(&vq->mutex);
+ vq->iotlb = niotlb;
+ __vhost_vq_meta_reset(vq);
+ mutex_unlock(&vq->mutex);
}
vhost_umem_clean(oiotlb);
diff --git a/fs/dcache.c b/fs/dcache.c
index 2acfc69878f5..811114b229b1 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -358,14 +358,11 @@ static void dentry_unlink_inode(struct dentry * dentry)
__releases(dentry->d_inode->i_lock)
{
struct inode *inode = dentry->d_inode;
- bool hashed = !d_unhashed(dentry);
- if (hashed)
- raw_write_seqcount_begin(&dentry->d_seq);
+ raw_write_seqcount_begin(&dentry->d_seq);
__d_clear_type_and_inode(dentry);
hlist_del_init(&dentry->d_u.d_alias);
- if (hashed)
- raw_write_seqcount_end(&dentry->d_seq);
+ raw_write_seqcount_end(&dentry->d_seq);
spin_unlock(&dentry->d_lock);
spin_unlock(&inode->i_lock);
if (!inode->i_nlink)
@@ -1954,10 +1951,12 @@ struct dentry *d_make_root(struct inode *root_inode)
if (root_inode) {
res = d_alloc_anon(root_inode->i_sb);
- if (res)
+ if (res) {
+ res->d_flags |= DCACHE_RCUACCESS;
d_instantiate(res, root_inode);
- else
+ } else {
iput(root_inode);
+ }
}
return res;
}
diff --git a/fs/namespace.c b/fs/namespace.c
index 5f75969adff1..51a1935060a9 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -659,12 +659,21 @@ int __legitimize_mnt(struct vfsmount *bastard, unsigned seq)
return 0;
mnt = real_mount(bastard);
mnt_add_count(mnt, 1);
+ smp_mb(); // see mntput_no_expire()
if (likely(!read_seqretry(&mount_lock, seq)))
return 0;
if (bastard->mnt_flags & MNT_SYNC_UMOUNT) {
mnt_add_count(mnt, -1);
return 1;
}
+ lock_mount_hash();
+ if (unlikely(bastard->mnt_flags & MNT_DOOMED)) {
+ mnt_add_count(mnt, -1);
+ unlock_mount_hash();
+ return 1;
+ }
+ unlock_mount_hash();
+ /* caller will mntput() */
return -1;
}
@@ -1195,12 +1204,27 @@ static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput);
static void mntput_no_expire(struct mount *mnt)
{
rcu_read_lock();
- mnt_add_count(mnt, -1);
- if (likely(mnt->mnt_ns)) { /* shouldn't be the last one */
+ if (likely(READ_ONCE(mnt->mnt_ns))) {
+ /*
+ * Since we don't do lock_mount_hash() here,
+ * ->mnt_ns can change under us. However, if it's
+ * non-NULL, then there's a reference that won't
+ * be dropped until after an RCU delay done after
+ * turning ->mnt_ns NULL. So if we observe it
+ * non-NULL under rcu_read_lock(), the reference
+ * we are dropping is not the final one.
+ */
+ mnt_add_count(mnt, -1);
rcu_read_unlock();
return;
}
lock_mount_hash();
+ /*
+ * make sure that if __legitimize_mnt() has not seen us grab
+ * mount_lock, we'll see their refcount increment here.
+ */
+ smp_mb();
+ mnt_add_count(mnt, -1);
if (mnt_get_count(mnt)) {
rcu_read_unlock();
unlock_mount_hash();
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index f59639afaa39..a75cb371cd19 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -1019,8 +1019,8 @@ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot);
int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot);
int pud_clear_huge(pud_t *pud);
int pmd_clear_huge(pmd_t *pmd);
-int pud_free_pmd_page(pud_t *pud);
-int pmd_free_pte_page(pmd_t *pmd);
+int pud_free_pmd_page(pud_t *pud, unsigned long addr);
+int pmd_free_pte_page(pmd_t *pmd, unsigned long addr);
#else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */
static inline int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot)
{
@@ -1046,11 +1046,11 @@ static inline int pmd_clear_huge(pmd_t *pmd)
{
return 0;
}
-static inline int pud_free_pmd_page(pud_t *pud)
+static inline int pud_free_pmd_page(pud_t *pud, unsigned long addr)
{
return 0;
}
-static inline int pmd_free_pte_page(pmd_t *pmd)
+static inline int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
{
return 0;
}
@@ -1083,6 +1083,18 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
static inline void init_espfix_bsp(void) { }
#endif
+#ifndef __HAVE_ARCH_PFN_MODIFY_ALLOWED
+static inline bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot)
+{
+ return true;
+}
+
+static inline bool arch_has_pfn_modify_check(void)
+{
+ return false;
+}
+#endif /* !_HAVE_ARCH_PFN_MODIFY_ALLOWED */
+
#endif /* !__ASSEMBLY__ */
#ifndef io_remap_pfn_range
diff --git a/include/crypto/vmac.h b/include/crypto/vmac.h
deleted file mode 100644
index 6b700c7b2fe1..000000000000
--- a/include/crypto/vmac.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Modified to interface to the Linux kernel
- * Copyright (c) 2009, Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
- */
-
-#ifndef __CRYPTO_VMAC_H
-#define __CRYPTO_VMAC_H
-
-/* --------------------------------------------------------------------------
- * VMAC and VHASH Implementation by Ted Krovetz (tdk@acm.org) and Wei Dai.
- * This implementation is herby placed in the public domain.
- * The authors offers no warranty. Use at your own risk.
- * Please send bug reports to the authors.
- * Last modified: 17 APR 08, 1700 PDT
- * ----------------------------------------------------------------------- */
-
-/*
- * User definable settings.
- */
-#define VMAC_TAG_LEN 64
-#define VMAC_KEY_SIZE 128/* Must be 128, 192 or 256 */
-#define VMAC_KEY_LEN (VMAC_KEY_SIZE/8)
-#define VMAC_NHBYTES 128/* Must 2^i for any 3 < i < 13 Standard = 128*/
-
-/*
- * This implementation uses u32 and u64 as names for unsigned 32-
- * and 64-bit integer types. These are defined in C99 stdint.h. The
- * following may need adaptation if you are not running a C99 or
- * Microsoft C environment.
- */
-struct vmac_ctx {
- u64 nhkey[(VMAC_NHBYTES/8)+2*(VMAC_TAG_LEN/64-1)];
- u64 polykey[2*VMAC_TAG_LEN/64];
- u64 l3key[2*VMAC_TAG_LEN/64];
- u64 polytmp[2*VMAC_TAG_LEN/64];
- u64 cached_nonce[2];
- u64 cached_aes[2];
- int first_block_processed;
-};
-
-typedef u64 vmac_t;
-
-struct vmac_ctx_t {
- struct crypto_cipher *child;
- struct vmac_ctx __vmac_ctx;
- u8 partial[VMAC_NHBYTES]; /* partial block */
- int partial_size; /* size of the partial block */
-};
-
-#endif /* __CRYPTO_VMAC_H */
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index a97a63eef59f..45789a892c41 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -30,7 +30,7 @@ struct cpu {
};
extern void boot_cpu_init(void);
-extern void boot_cpu_state_init(void);
+extern void boot_cpu_hotplug_init(void);
extern void cpu_init(void);
extern void trap_init(void);
@@ -55,6 +55,8 @@ extern ssize_t cpu_show_spectre_v2(struct device *dev,
struct device_attribute *attr, char *buf);
extern ssize_t cpu_show_spec_store_bypass(struct device *dev,
struct device_attribute *attr, char *buf);
+extern ssize_t cpu_show_l1tf(struct device *dev,
+ struct device_attribute *attr, char *buf);
extern __printf(4, 5)
struct device *cpu_device_create(struct device *parent, void *drvdata,
@@ -166,4 +168,23 @@ void cpuhp_report_idle_dead(void);
static inline void cpuhp_report_idle_dead(void) { }
#endif /* #ifdef CONFIG_HOTPLUG_CPU */
+enum cpuhp_smt_control {
+ CPU_SMT_ENABLED,
+ CPU_SMT_DISABLED,
+ CPU_SMT_FORCE_DISABLED,
+ CPU_SMT_NOT_SUPPORTED,
+};
+
+#if defined(CONFIG_SMP) && defined(CONFIG_HOTPLUG_SMT)
+extern enum cpuhp_smt_control cpu_smt_control;
+extern void cpu_smt_disable(bool force);
+extern void cpu_smt_check_topology_early(void);
+extern void cpu_smt_check_topology(void);
+#else
+# define cpu_smt_control (CPU_SMT_ENABLED)
+static inline void cpu_smt_disable(bool force) { }
+static inline void cpu_smt_check_topology_early(void) { }
+static inline void cpu_smt_check_topology(void) { }
+#endif
+
#endif /* _LINUX_CPU_H_ */
diff --git a/include/linux/swapfile.h b/include/linux/swapfile.h
index 06bd7b096167..e06febf62978 100644
--- a/include/linux/swapfile.h
+++ b/include/linux/swapfile.h
@@ -10,5 +10,7 @@ extern spinlock_t swap_lock;
extern struct plist_head swap_active_head;
extern struct swap_info_struct *swap_info[];
extern int try_to_unuse(unsigned int, bool, unsigned long);
+extern unsigned long generic_max_swapfile_size(void);
+extern unsigned long max_swapfile_size(void);
#endif /* _LINUX_SWAPFILE_H */
diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
index 9324ac2d9ff2..43913ae79f64 100644
--- a/include/net/af_vsock.h
+++ b/include/net/af_vsock.h
@@ -64,7 +64,8 @@ struct vsock_sock {
struct list_head pending_links;
struct list_head accept_queue;
bool rejected;
- struct delayed_work dwork;
+ struct delayed_work connect_work;
+ struct delayed_work pending_work;
struct delayed_work close_work;
bool close_work_scheduled;
u32 peer_shutdown;
@@ -77,7 +78,6 @@ struct vsock_sock {
s64 vsock_stream_has_data(struct vsock_sock *vsk);
s64 vsock_stream_has_space(struct vsock_sock *vsk);
-void vsock_pending_work(struct work_struct *work);
struct sock *__vsock_create(struct net *net,
struct socket *sock,
struct sock *parent,
diff --git a/include/net/llc.h b/include/net/llc.h
index dc35f25eb679..890a87318014 100644
--- a/include/net/llc.h
+++ b/include/net/llc.h
@@ -116,6 +116,11 @@ static inline void llc_sap_hold(struct llc_sap *sap)
refcount_inc(&sap->refcnt);
}
+static inline bool llc_sap_hold_safe(struct llc_sap *sap)
+{
+ return refcount_inc_not_zero(&sap->refcnt);
+}
+
void llc_sap_close(struct llc_sap *sap);
static inline void llc_sap_put(struct llc_sap *sap)
diff --git a/init/main.c b/init/main.c
index 3b4ada11ed52..5e13c544bbf4 100644
--- a/init/main.c
+++ b/init/main.c
@@ -561,8 +561,8 @@ asmlinkage __visible void __init start_kernel(void)
setup_command_line(command_line);
setup_nr_cpu_ids();
setup_per_cpu_areas();
- boot_cpu_state_init();
smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
+ boot_cpu_hotplug_init();
build_all_zonelists(NULL);
page_alloc_init();
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index fc7ee4357381..70edc41a88d5 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -947,12 +947,12 @@ static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
while (msg_data_left(msg)) {
- struct sk_msg_buff *m;
+ struct sk_msg_buff *m = NULL;
bool enospc = false;
int copy;
if (sk->sk_err) {
- err = sk->sk_err;
+ err = -sk->sk_err;
goto out_err;
}
@@ -1015,8 +1015,11 @@ wait_for_sndbuf:
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
wait_for_memory:
err = sk_stream_wait_memory(sk, &timeo);
- if (err)
+ if (err) {
+ if (m && m != psock->cork)
+ free_start_sg(sk, m);
goto out_err;
+ }
}
out_err:
if (err < 0)
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 0db8938fbb23..f80afc674f02 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -60,6 +60,7 @@ struct cpuhp_cpu_state {
bool rollback;
bool single;
bool bringup;
+ bool booted_once;
struct hlist_node *node;
struct hlist_node *last;
enum cpuhp_state cb_state;
@@ -342,6 +343,85 @@ void cpu_hotplug_enable(void)
EXPORT_SYMBOL_GPL(cpu_hotplug_enable);
#endif /* CONFIG_HOTPLUG_CPU */
+#ifdef CONFIG_HOTPLUG_SMT
+enum cpuhp_smt_control cpu_smt_control __read_mostly = CPU_SMT_ENABLED;
+EXPORT_SYMBOL_GPL(cpu_smt_control);
+
+static bool cpu_smt_available __read_mostly;
+
+void __init cpu_smt_disable(bool force)
+{
+ if (cpu_smt_control == CPU_SMT_FORCE_DISABLED ||
+ cpu_smt_control == CPU_SMT_NOT_SUPPORTED)
+ return;
+
+ if (force) {
+ pr_info("SMT: Force disabled\n");
+ cpu_smt_control = CPU_SMT_FORCE_DISABLED;
+ } else {
+ cpu_smt_control = CPU_SMT_DISABLED;
+ }
+}
+
+/*
+ * The decision whether SMT is supported can only be done after the full
+ * CPU identification. Called from architecture code before non boot CPUs
+ * are brought up.
+ */
+void __init cpu_smt_check_topology_early(void)
+{
+ if (!topology_smt_supported())
+ cpu_smt_control = CPU_SMT_NOT_SUPPORTED;
+}
+
+/*
+ * If SMT was disabled by BIOS, detect it here, after the CPUs have been
+ * brought online. This ensures the smt/l1tf sysfs entries are consistent
+ * with reality. cpu_smt_available is set to true during the bringup of non
+ * boot CPUs when a SMT sibling is detected. Note, this may overwrite
+ * cpu_smt_control's previous setting.
+ */
+void __init cpu_smt_check_topology(void)
+{
+ if (!cpu_smt_available)
+ cpu_smt_control = CPU_SMT_NOT_SUPPORTED;
+}
+
+static int __init smt_cmdline_disable(char *str)
+{
+ cpu_smt_disable(str && !strcmp(str, "force"));
+ return 0;
+}
+early_param("nosmt", smt_cmdline_disable);
+
+static inline bool cpu_smt_allowed(unsigned int cpu)
+{
+ if (topology_is_primary_thread(cpu))
+ return true;
+
+ /*
+ * If the CPU is not a 'primary' thread and the booted_once bit is
+ * set then the processor has SMT support. Store this information
+ * for the late check of SMT support in cpu_smt_check_topology().
+ */
+ if (per_cpu(cpuhp_state, cpu).booted_once)
+ cpu_smt_available = true;
+
+ if (cpu_smt_control == CPU_SMT_ENABLED)
+ return true;
+
+ /*
+ * On x86 it's required to boot all logical CPUs at least once so
+ * that the init code can get a chance to set CR4.MCE on each
+ * CPU. Otherwise, a broadacasted MCE observing CR4.MCE=0b on any
+ * core will shutdown the machine.
+ */
+ return !per_cpu(cpuhp_state, cpu).booted_once;
+}
+#else
+static inline bool cpu_smt_allowed(unsigned int cpu) { return true; }
+#endif
+
static inline enum cpuhp_state
cpuhp_set_state(struct cpuhp_cpu_state *st, enum cpuhp_state target)
{
@@ -422,6 +502,16 @@ static int bringup_wait_for_ap(unsigned int cpu)
stop_machine_unpark(cpu);
kthread_unpark(st->thread);
+ /*
+ * SMT soft disabling on X86 requires to bring the CPU out of the
+ * BIOS 'wait for SIPI' state in order to set the CR4.MCE bit. The
+ * CPU marked itself as booted_once in cpu_notify_starting() so the
+ * cpu_smt_allowed() check will now return false if this is not the
+ * primary sibling.
+ */
+ if (!cpu_smt_allowed(cpu))
+ return -ECANCELED;
+
if (st->target <= CPUHP_AP_ONLINE_IDLE)
return 0;
@@ -754,7 +844,6 @@ static int takedown_cpu(unsigned int cpu)
/* Park the smpboot threads */
kthread_park(per_cpu_ptr(&cpuhp_state, cpu)->thread);
- smpboot_park_threads(cpu);
/*
* Prevent irq alloc/free while the dying cpu reorganizes the
@@ -907,20 +996,19 @@ out:
return ret;
}
+static int cpu_down_maps_locked(unsigned int cpu, enum cpuhp_state target)
+{
+ if (cpu_hotplug_disabled)
+ return -EBUSY;
+ return _cpu_down(cpu, 0, target);
+}
+
static int do_cpu_down(unsigned int cpu, enum cpuhp_state target)
{
int err;
cpu_maps_update_begin();
-
- if (cpu_hotplug_disabled) {
- err = -EBUSY;
- goto out;
- }
-
- err = _cpu_down(cpu, 0, target);
-
-out:
+ err = cpu_down_maps_locked(cpu, target);
cpu_maps_update_done();
return err;
}
@@ -949,6 +1037,7 @@ void notify_cpu_starting(unsigned int cpu)
int ret;
rcu_cpu_starting(cpu); /* Enables RCU usage on this CPU. */
+ st->booted_once = true;
while (st->state < target) {
st->state++;
ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL);
@@ -1058,6 +1147,10 @@ static int do_cpu_up(unsigned int cpu, enum cpuhp_state target)
err = -EBUSY;
goto out;
}
+ if (!cpu_smt_allowed(cpu)) {
+ err = -EPERM;
+ goto out;
+ }
err = _cpu_up(cpu, 0, target);
out:
@@ -1332,7 +1425,7 @@ static struct cpuhp_step cpuhp_hp_states[] = {
[CPUHP_AP_SMPBOOT_THREADS] = {
.name = "smpboot/threads:online",
.startup.single = smpboot_unpark_threads,
- .teardown.single = NULL,
+ .teardown.single = smpboot_park_threads,
},
[CPUHP_AP_IRQ_AFFINITY_ONLINE] = {
.name = "irq/affinity:online",
@@ -1906,10 +1999,172 @@ static const struct attribute_group cpuhp_cpu_root_attr_group = {
NULL
};
+#ifdef CONFIG_HOTPLUG_SMT
+
+static const char *smt_states[] = {
+ [CPU_SMT_ENABLED] = "on",
+ [CPU_SMT_DISABLED] = "off",
+ [CPU_SMT_FORCE_DISABLED] = "forceoff",
+ [CPU_SMT_NOT_SUPPORTED] = "notsupported",
+};
+
+static ssize_t
+show_smt_control(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE - 2, "%s\n", smt_states[cpu_smt_control]);
+}
+
+static void cpuhp_offline_cpu_device(unsigned int cpu)
+{
+ struct device *dev = get_cpu_device(cpu);
+
+ dev->offline = true;
+ /* Tell user space about the state change */
+ kobject_uevent(&dev->kobj, KOBJ_OFFLINE);
+}
+
+static void cpuhp_online_cpu_device(unsigned int cpu)
+{
+ struct device *dev = get_cpu_device(cpu);
+
+ dev->offline = false;
+ /* Tell user space about the state change */
+ kobject_uevent(&dev->kobj, KOBJ_ONLINE);
+}
+
+static int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval)
+{
+ int cpu, ret = 0;
+
+ cpu_maps_update_begin();
+ for_each_online_cpu(cpu) {
+ if (topology_is_primary_thread(cpu))
+ continue;
+ ret = cpu_down_maps_locked(cpu, CPUHP_OFFLINE);
+ if (ret)
+ break;
+ /*
+ * As this needs to hold the cpu maps lock it's impossible
+ * to call device_offline() because that ends up calling
+ * cpu_down() which takes cpu maps lock. cpu maps lock
+ * needs to be held as this might race against in kernel
+ * abusers of the hotplug machinery (thermal management).
+ *
+ * So nothing would update device:offline state. That would
+ * leave the sysfs entry stale and prevent onlining after
+ * smt control has been changed to 'off' again. This is
+ * called under the sysfs hotplug lock, so it is properly
+ * serialized against the regular offline usage.
+ */
+ cpuhp_offline_cpu_device(cpu);
+ }
+ if (!ret)
+ cpu_smt_control = ctrlval;
+ cpu_maps_update_done();
+ return ret;
+}
+
+static int cpuhp_smt_enable(void)
+{
+ int cpu, ret = 0;
+
+ cpu_maps_update_begin();
+ cpu_smt_control = CPU_SMT_ENABLED;
+ for_each_present_cpu(cpu) {
+ /* Skip online CPUs and CPUs on offline nodes */
+ if (cpu_online(cpu) || !node_online(cpu_to_node(cpu)))
+ continue;
+ ret = _cpu_up(cpu, 0, CPUHP_ONLINE);
+ if (ret)
+ break;
+ /* See comment in cpuhp_smt_disable() */
+ cpuhp_online_cpu_device(cpu);
+ }
+ cpu_maps_update_done();
+ return ret;
+}
+
+static ssize_t
+store_smt_control(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ int ctrlval, ret;
+
+ if (sysfs_streq(buf, "on"))
+ ctrlval = CPU_SMT_ENABLED;
+ else if (sysfs_streq(buf, "off"))
+ ctrlval = CPU_SMT_DISABLED;
+ else if (sysfs_streq(buf, "forceoff"))
+ ctrlval = CPU_SMT_FORCE_DISABLED;
+ else
+ return -EINVAL;
+
+ if (cpu_smt_control == CPU_SMT_FORCE_DISABLED)
+ return -EPERM;
+
+ if (cpu_smt_control == CPU_SMT_NOT_SUPPORTED)
+ return -ENODEV;
+
+ ret = lock_device_hotplug_sysfs();
+ if (ret)
+ return ret;
+
+ if (ctrlval != cpu_smt_control) {
+ switch (ctrlval) {
+ case CPU_SMT_ENABLED:
+ ret = cpuhp_smt_enable();
+ break;
+ case CPU_SMT_DISABLED:
+ case CPU_SMT_FORCE_DISABLED:
+ ret = cpuhp_smt_disable(ctrlval);
+ break;
+ }
+ }
+
+ unlock_device_hotplug();
+ return ret ? ret : count;
+}
+static DEVICE_ATTR(control, 0644, show_smt_control, store_smt_control);
+
+static ssize_t
+show_smt_active(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ bool active = topology_max_smt_threads() > 1;
+
+ return snprintf(buf, PAGE_SIZE - 2, "%d\n", active);
+}
+static DEVICE_ATTR(active, 0444, show_smt_active, NULL);
+
+static struct attribute *cpuhp_smt_attrs[] = {
+ &dev_attr_control.attr,
+ &dev_attr_active.attr,
+ NULL
+};
+
+static const struct attribute_group cpuhp_smt_attr_group = {
+ .attrs = cpuhp_smt_attrs,
+ .name = "smt",
+ NULL
+};
+
+static int __init cpu_smt_state_init(void)
+{
+ return sysfs_create_group(&cpu_subsys.dev_root->kobj,
+ &cpuhp_smt_attr_group);
+}
+
+#else
+static inline int cpu_smt_state_init(void) { return 0; }
+#endif
+
static int __init cpuhp_sysfs_init(void)
{
int cpu, ret;
+ ret = cpu_smt_state_init();
+ if (ret)
+ return ret;
+
ret = sysfs_create_group(&cpu_subsys.dev_root->kobj,
&cpuhp_cpu_root_attr_group);
if (ret)
@@ -2010,7 +2265,10 @@ void __init boot_cpu_init(void)
/*
* Must be called _AFTER_ setting up the per_cpu areas
*/
-void __init boot_cpu_state_init(void)
+void __init boot_cpu_hotplug_init(void)
{
- per_cpu_ptr(&cpuhp_state, smp_processor_id())->state = CPUHP_ONLINE;
+#ifdef CONFIG_SMP
+ this_cpu_write(cpuhp_state.booted_once, true);
+#endif
+ this_cpu_write(cpuhp_state.state, CPUHP_ONLINE);
}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 211890edf37e..ec945451b9ef 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5788,6 +5788,18 @@ int sched_cpu_activate(unsigned int cpu)
struct rq *rq = cpu_rq(cpu);
struct rq_flags rf;
+#ifdef CONFIG_SCHED_SMT
+ /*
+ * The sched_smt_present static key needs to be evaluated on every
+ * hotplug event because at boot time SMT might be disabled when
+ * the number of booted CPUs is limited.
+ *
+ * If then later a sibling gets hotplugged, then the key would stay
+ * off and SMT scheduling would never be functional.
+ */
+ if (cpumask_weight(cpu_smt_mask(cpu)) > 1)
+ static_branch_enable_cpuslocked(&sched_smt_present);
+#endif
set_cpu_active(cpu, true);
if (sched_smp_initialized) {
@@ -5885,22 +5897,6 @@ int sched_cpu_dying(unsigned int cpu)
}
#endif
-#ifdef CONFIG_SCHED_SMT
-DEFINE_STATIC_KEY_FALSE(sched_smt_present);
-
-static void sched_init_smt(void)
-{
- /*
- * We've enumerated all CPUs and will assume that if any CPU
- * has SMT siblings, CPU0 will too.
- */
- if (cpumask_weight(cpu_smt_mask(0)) > 1)
- static_branch_enable(&sched_smt_present);
-}
-#else
-static inline void sched_init_smt(void) { }
-#endif
-
void __init sched_init_smp(void)
{
sched_init_numa();
@@ -5922,8 +5918,6 @@ void __init sched_init_smp(void)
init_sched_rt_class();
init_sched_dl_class();
- sched_init_smt();
-
sched_smp_initialized = true;
}
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index fbfc3f1d368a..8b50eea4b607 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2090,8 +2090,14 @@ retry:
sub_rq_bw(&next_task->dl, &rq->dl);
set_task_cpu(next_task, later_rq->cpu);
add_rq_bw(&next_task->dl, &later_rq->dl);
+
+ /*
+ * Update the later_rq clock here, because the clock is used
+ * by the cpufreq_update_util() inside __add_running_bw().
+ */
+ update_rq_clock(later_rq);
add_running_bw(&next_task->dl, &later_rq->dl);
- activate_task(later_rq, next_task, 0);
+ activate_task(later_rq, next_task, ENQUEUE_NOCLOCK);
ret = 1;
resched_curr(later_rq);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 79f574dba096..183068d22849 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6183,6 +6183,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
}
#ifdef CONFIG_SCHED_SMT
+DEFINE_STATIC_KEY_FALSE(sched_smt_present);
static inline void set_idle_cores(int cpu, int val)
{
diff --git a/kernel/smp.c b/kernel/smp.c
index 084c8b3a2681..d86eec5f51c1 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -584,6 +584,8 @@ void __init smp_init(void)
num_nodes, (num_nodes > 1 ? "s" : ""),
num_cpus, (num_cpus > 1 ? "s" : ""));
+ /* Final decision about SMT support */
+ cpu_smt_check_topology();
/* Any cleanup work */
smp_cpus_done(setup_max_cpus);
}
diff --git a/kernel/softirq.c b/kernel/softirq.c
index ce4fb0e12504..7a076b6c537a 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -79,12 +79,16 @@ static void wakeup_softirqd(void)
/*
* If ksoftirqd is scheduled, we do not want to process pending softirqs
- * right now. Let ksoftirqd handle this at its own rate, to get fairness.
+ * right now. Let ksoftirqd handle this at its own rate, to get fairness,
+ * unless we're doing some of the synchronous softirqs.
*/
-static bool ksoftirqd_running(void)
+#define SOFTIRQ_NOW_MASK ((1 << HI_SOFTIRQ) | (1 << TASKLET_SOFTIRQ))
+static bool ksoftirqd_running(unsigned long pending)
{
struct task_struct *tsk = __this_cpu_read(ksoftirqd);
+ if (pending & SOFTIRQ_NOW_MASK)
+ return false;
return tsk && (tsk->state == TASK_RUNNING);
}
@@ -329,7 +333,7 @@ asmlinkage __visible void do_softirq(void)
pending = local_softirq_pending();
- if (pending && !ksoftirqd_running())
+ if (pending && !ksoftirqd_running(pending))
do_softirq_own_stack();
local_irq_restore(flags);
@@ -356,7 +360,7 @@ void irq_enter(void)
static inline void invoke_softirq(void)
{
- if (ksoftirqd_running())
+ if (ksoftirqd_running(local_softirq_pending()))
return;
if (!force_irqthreads) {
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 1ff523dae6e2..e190d1ef3a23 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -260,6 +260,15 @@ retry:
err = 0;
__cpu_stop_queue_work(stopper1, work1, &wakeq);
__cpu_stop_queue_work(stopper2, work2, &wakeq);
+ /*
+ * The waking up of stopper threads has to happen
+ * in the same scheduling context as the queueing.
+ * Otherwise, there is a possibility of one of the
+ * above stoppers being woken up by another CPU,
+ * and preempting us. This will cause us to n ot
+ * wake up the other stopper forever.
+ */
+ preempt_disable();
unlock:
raw_spin_unlock(&stopper2->lock);
raw_spin_unlock_irq(&stopper1->lock);
@@ -271,7 +280,6 @@ unlock:
}
if (!err) {
- preempt_disable();
wake_up_q(&wakeq);
preempt_enable();
}
diff --git a/lib/ioremap.c b/lib/ioremap.c
index 54e5bbaa3200..517f5853ffed 100644
--- a/lib/ioremap.c
+++ b/lib/ioremap.c
@@ -92,7 +92,7 @@ static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr,
if (ioremap_pmd_enabled() &&
((next - addr) == PMD_SIZE) &&
IS_ALIGNED(phys_addr + addr, PMD_SIZE) &&
- pmd_free_pte_page(pmd)) {
+ pmd_free_pte_page(pmd, addr)) {
if (pmd_set_huge(pmd, phys_addr + addr, prot))
continue;
}
@@ -119,7 +119,7 @@ static inline int ioremap_pud_range(p4d_t *p4d, unsigned long addr,
if (ioremap_pud_enabled() &&
((next - addr) == PUD_SIZE) &&
IS_ALIGNED(phys_addr + addr, PUD_SIZE) &&
- pud_free_pmd_page(pud)) {
+ pud_free_pmd_page(pud, addr)) {
if (pud_set_huge(pud, phys_addr + addr, prot))
continue;
}
diff --git a/mm/memory.c b/mm/memory.c
index 01f5464e0fd2..fe497cecd2ab 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1891,6 +1891,9 @@ int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
if (addr < vma->vm_start || addr >= vma->vm_end)
return -EFAULT;
+ if (!pfn_modify_allowed(pfn, pgprot))
+ return -EACCES;
+
track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV));
ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
@@ -1926,6 +1929,9 @@ static int __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
track_pfn_insert(vma, &pgprot, pfn);
+ if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot))
+ return -EACCES;
+
/*
* If we don't have pte special, then we have to use the pfn_valid()
* based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must*
@@ -1973,6 +1979,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
{
pte_t *pte;
spinlock_t *ptl;
+ int err = 0;
pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
if (!pte)
@@ -1980,12 +1987,16 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
arch_enter_lazy_mmu_mode();
do {
BUG_ON(!pte_none(*pte));
+ if (!pfn_modify_allowed(pfn, prot)) {
+ err = -EACCES;
+ break;
+ }
set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
pfn++;
} while (pte++, addr += PAGE_SIZE, addr != end);
arch_leave_lazy_mmu_mode();
pte_unmap_unlock(pte - 1, ptl);
- return 0;
+ return err;
}
static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
@@ -1994,6 +2005,7 @@ static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
{
pmd_t *pmd;
unsigned long next;
+ int err;
pfn -= addr >> PAGE_SHIFT;
pmd = pmd_alloc(mm, pud, addr);
@@ -2002,9 +2014,10 @@ static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
VM_BUG_ON(pmd_trans_huge(*pmd));
do {
next = pmd_addr_end(addr, end);
- if (remap_pte_range(mm, pmd, addr, next,
- pfn + (addr >> PAGE_SHIFT), prot))
- return -ENOMEM;
+ err = remap_pte_range(mm, pmd, addr, next,
+ pfn + (addr >> PAGE_SHIFT), prot);
+ if (err)
+ return err;
} while (pmd++, addr = next, addr != end);
return 0;
}
@@ -2015,6 +2028,7 @@ static inline int remap_pud_range(struct mm_struct *mm, p4d_t *p4d,
{
pud_t *pud;
unsigned long next;
+ int err;
pfn -= addr >> PAGE_SHIFT;
pud = pud_alloc(mm, p4d, addr);
@@ -2022,9 +2036,10 @@ static inline int remap_pud_range(struct mm_struct *mm, p4d_t *p4d,
return -ENOMEM;
do {
next = pud_addr_end(addr, end);
- if (remap_pmd_range(mm, pud, addr, next,
- pfn + (addr >> PAGE_SHIFT), prot))
- return -ENOMEM;
+ err = remap_pmd_range(mm, pud, addr, next,
+ pfn + (addr >> PAGE_SHIFT), prot);
+ if (err)
+ return err;
} while (pud++, addr = next, addr != end);
return 0;
}
@@ -2035,6 +2050,7 @@ static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
{
p4d_t *p4d;
unsigned long next;
+ int err;
pfn -= addr >> PAGE_SHIFT;
p4d = p4d_alloc(mm, pgd, addr);
@@ -2042,9 +2058,10 @@ static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
return -ENOMEM;
do {
next = p4d_addr_end(addr, end);
- if (remap_pud_range(mm, p4d, addr, next,
- pfn + (addr >> PAGE_SHIFT), prot))
- return -ENOMEM;
+ err = remap_pud_range(mm, p4d, addr, next,
+ pfn + (addr >> PAGE_SHIFT), prot);
+ if (err)
+ return err;
} while (p4d++, addr = next, addr != end);
return 0;
}
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 625608bc8962..6d331620b9e5 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -306,6 +306,42 @@ unsigned long change_protection(struct vm_area_struct *vma, unsigned long start,
return pages;
}
+static int prot_none_pte_entry(pte_t *pte, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ return pfn_modify_allowed(pte_pfn(*pte), *(pgprot_t *)(walk->private)) ?
+ 0 : -EACCES;
+}
+
+static int prot_none_hugetlb_entry(pte_t *pte, unsigned long hmask,
+ unsigned long addr, unsigned long next,
+ struct mm_walk *walk)
+{
+ return pfn_modify_allowed(pte_pfn(*pte), *(pgprot_t *)(walk->private)) ?
+ 0 : -EACCES;
+}
+
+static int prot_none_test(unsigned long addr, unsigned long next,
+ struct mm_walk *walk)
+{
+ return 0;
+}
+
+static int prot_none_walk(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end, unsigned long newflags)
+{
+ pgprot_t new_pgprot = vm_get_page_prot(newflags);
+ struct mm_walk prot_none_walk = {
+ .pte_entry = prot_none_pte_entry,
+ .hugetlb_entry = prot_none_hugetlb_entry,
+ .test_walk = prot_none_test,
+ .mm = current->mm,
+ .private = &new_pgprot,
+ };
+
+ return walk_page_range(start, end, &prot_none_walk);
+}
+
int
mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
unsigned long start, unsigned long end, unsigned long newflags)
@@ -324,6 +360,19 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
}
/*
+ * Do PROT_NONE PFN permission checks here when we can still
+ * bail out without undoing a lot of state. This is a rather
+ * uncommon case, so doesn't need to be very optimized.
+ */
+ if (arch_has_pfn_modify_check() &&
+ (vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
+ (newflags & (VM_READ|VM_WRITE|VM_EXEC)) == 0) {
+ error = prot_none_walk(vma, start, end, newflags);
+ if (error)
+ return error;
+ }
+
+ /*
* If we make a private mapping writable we increase our commit;
* but (without finer accounting) cannot reduce our commit if we
* make it unwritable again. hugetlb mapping were accounted for
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 78a015fcec3b..6ac2757d5997 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2909,6 +2909,35 @@ static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
return 0;
}
+
+/*
+ * Find out how many pages are allowed for a single swap device. There
+ * are two limiting factors:
+ * 1) the number of bits for the swap offset in the swp_entry_t type, and
+ * 2) the number of bits in the swap pte, as defined by the different
+ * architectures.
+ *
+ * In order to find the largest possible bit mask, a swap entry with
+ * swap type 0 and swap offset ~0UL is created, encoded to a swap pte,
+ * decoded to a swp_entry_t again, and finally the swap offset is
+ * extracted.
+ *
+ * This will mask all the bits from the initial ~0UL mask that can't
+ * be encoded in either the swp_entry_t or the architecture definition
+ * of a swap pte.
+ */
+unsigned long generic_max_swapfile_size(void)
+{
+ return swp_offset(pte_to_swp_entry(
+ swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
+}
+
+/* Can be overridden by an architecture for additional checks. */
+__weak unsigned long max_swapfile_size(void)
+{
+ return generic_max_swapfile_size();
+}
+
static unsigned long read_swap_header(struct swap_info_struct *p,
union swap_header *swap_header,
struct inode *inode)
@@ -2944,22 +2973,7 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
p->cluster_next = 1;
p->cluster_nr = 0;
- /*
- * Find out how many pages are allowed for a single swap
- * device. There are two limiting factors: 1) the number
- * of bits for the swap offset in the swp_entry_t type, and
- * 2) the number of bits in the swap pte as defined by the
- * different architectures. In order to find the
- * largest possible bit mask, a swap entry with swap type 0
- * and swap offset ~0UL is created, encoded to a swap pte,
- * decoded to a swp_entry_t again, and finally the swap
- * offset is extracted. This will mask all the bits from
- * the initial ~0UL mask that can't be encoded in either
- * the swp_entry_t or the architecture definition of a
- * swap pte.
- */
- maxpages = swp_offset(pte_to_swp_entry(
- swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
+ maxpages = max_swapfile_size();
last_page = swap_header->info.last_page;
if (!last_page) {
pr_warn("Empty swap-file\n");
diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c
index 1036e4fa1ea2..3bba8f4b08a9 100644
--- a/net/bluetooth/hidp/core.c
+++ b/net/bluetooth/hidp/core.c
@@ -431,8 +431,8 @@ static void hidp_del_timer(struct hidp_session *session)
del_timer(&session->timer);
}
-static void hidp_process_report(struct hidp_session *session,
- int type, const u8 *data, int len, int intr)
+static void hidp_process_report(struct hidp_session *session, int type,
+ const u8 *data, unsigned int len, int intr)
{
if (len > HID_MAX_BUFFER_SIZE)
len = HID_MAX_BUFFER_SIZE;
diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index 413b8ee49fec..8f0f9279eac9 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -393,7 +393,8 @@ static void sco_sock_cleanup_listen(struct sock *parent)
*/
static void sco_sock_kill(struct sock *sk)
{
- if (!sock_flag(sk, SOCK_ZAPPED) || sk->sk_socket)
+ if (!sock_flag(sk, SOCK_ZAPPED) || sk->sk_socket ||
+ sock_flag(sk, SOCK_DEAD))
return;
BT_DBG("sk %p state %d", sk, sk->sk_state);
diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c
index c37b5be7c5e4..3312a5849a97 100644
--- a/net/core/sock_diag.c
+++ b/net/core/sock_diag.c
@@ -10,6 +10,7 @@
#include <linux/kernel.h>
#include <linux/tcp.h>
#include <linux/workqueue.h>
+#include <linux/nospec.h>
#include <linux/inet_diag.h>
#include <linux/sock_diag.h>
@@ -218,6 +219,7 @@ static int __sock_diag_cmd(struct sk_buff *skb, struct nlmsghdr *nlh)
if (req->sdiag_family >= AF_MAX)
return -EINVAL;
+ req->sdiag_family = array_index_nospec(req->sdiag_family, AF_MAX);
if (sock_diag_handlers[req->sdiag_family] == NULL)
sock_load_diag_module(req->sdiag_family, 0);
diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c
index 385f153fe031..33c5b1c88be2 100644
--- a/net/dccp/ccids/ccid2.c
+++ b/net/dccp/ccids/ccid2.c
@@ -228,14 +228,16 @@ static void ccid2_cwnd_restart(struct sock *sk, const u32 now)
struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
u32 cwnd = hc->tx_cwnd, restart_cwnd,
iwnd = rfc3390_bytes_to_packets(dccp_sk(sk)->dccps_mss_cache);
+ s32 delta = now - hc->tx_lsndtime;
hc->tx_ssthresh = max(hc->tx_ssthresh, (cwnd >> 1) + (cwnd >> 2));
/* don't reduce cwnd below the initial window (IW) */
restart_cwnd = min(cwnd, iwnd);
- cwnd >>= (now - hc->tx_lsndtime) / hc->tx_rto;
- hc->tx_cwnd = max(cwnd, restart_cwnd);
+ while ((delta -= hc->tx_rto) >= 0 && cwnd > restart_cwnd)
+ cwnd >>= 1;
+ hc->tx_cwnd = max(cwnd, restart_cwnd);
hc->tx_cwnd_stamp = now;
hc->tx_cwnd_used = 0;
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index 3f091ccad9af..f38cb21d773d 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -438,7 +438,8 @@ static int __net_init vti_init_net(struct net *net)
if (err)
return err;
itn = net_generic(net, vti_net_id);
- vti_fb_tunnel_init(itn->fb_tunnel_dev);
+ if (itn->fb_tunnel_dev)
+ vti_fb_tunnel_init(itn->fb_tunnel_dev);
return 0;
}
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 00e138a44cbb..1cc9650af9fb 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -1133,12 +1133,8 @@ route_lookup:
max_headroom += 8;
mtu -= 8;
}
- if (skb->protocol == htons(ETH_P_IPV6)) {
- if (mtu < IPV6_MIN_MTU)
- mtu = IPV6_MIN_MTU;
- } else if (mtu < 576) {
- mtu = 576;
- }
+ mtu = max(mtu, skb->protocol == htons(ETH_P_IPV6) ?
+ IPV6_MIN_MTU : IPV4_MIN_MTU);
skb_dst_update_pmtu(skb, mtu);
if (skb->len - t->tun_hlen - eth_hlen > mtu && !skb_is_gso(skb)) {
diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
index 40261cb68e83..8aaf8157da2b 100644
--- a/net/l2tp/l2tp_core.c
+++ b/net/l2tp/l2tp_core.c
@@ -1110,7 +1110,7 @@ int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb, int hdr_len
/* Get routing info from the tunnel socket */
skb_dst_drop(skb);
- skb_dst_set(skb, dst_clone(__sk_dst_check(sk, 0)));
+ skb_dst_set(skb, sk_dst_check(sk, 0));
inet = inet_sk(sk);
fl = &inet->cork.fl;
diff --git a/net/llc/llc_core.c b/net/llc/llc_core.c
index 89041260784c..260b3dc1b4a2 100644
--- a/net/llc/llc_core.c
+++ b/net/llc/llc_core.c
@@ -73,8 +73,8 @@ struct llc_sap *llc_sap_find(unsigned char sap_value)
rcu_read_lock_bh();
sap = __llc_sap_find(sap_value);
- if (sap)
- llc_sap_hold(sap);
+ if (!sap || !llc_sap_hold_safe(sap))
+ sap = NULL;
rcu_read_unlock_bh();
return sap;
}
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index 19975d2ca9a2..5da2d3379a57 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -104,9 +104,9 @@ struct rxrpc_net {
#define RXRPC_KEEPALIVE_TIME 20 /* NAT keepalive time in seconds */
u8 peer_keepalive_cursor;
- ktime_t peer_keepalive_base;
- struct hlist_head peer_keepalive[RXRPC_KEEPALIVE_TIME + 1];
- struct hlist_head peer_keepalive_new;
+ time64_t peer_keepalive_base;
+ struct list_head peer_keepalive[32];
+ struct list_head peer_keepalive_new;
struct timer_list peer_keepalive_timer;
struct work_struct peer_keepalive_work;
};
@@ -295,7 +295,7 @@ struct rxrpc_peer {
struct hlist_head error_targets; /* targets for net error distribution */
struct work_struct error_distributor;
struct rb_root service_conns; /* Service connections */
- struct hlist_node keepalive_link; /* Link in net->peer_keepalive[] */
+ struct list_head keepalive_link; /* Link in net->peer_keepalive[] */
time64_t last_tx_at; /* Last time packet sent here */
seqlock_t service_conn_lock;
spinlock_t lock; /* access lock */
diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c
index 8229a52c2acd..3fde001fcc39 100644
--- a/net/rxrpc/conn_event.c
+++ b/net/rxrpc/conn_event.c
@@ -136,7 +136,7 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn,
}
ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, ioc, len);
- conn->params.peer->last_tx_at = ktime_get_real();
+ conn->params.peer->last_tx_at = ktime_get_seconds();
if (ret < 0)
trace_rxrpc_tx_fail(conn->debug_id, serial, ret,
rxrpc_tx_fail_call_final_resend);
@@ -245,7 +245,7 @@ static int rxrpc_abort_connection(struct rxrpc_connection *conn,
return -EAGAIN;
}
- conn->params.peer->last_tx_at = ktime_get_real();
+ conn->params.peer->last_tx_at = ktime_get_seconds();
_leave(" = 0");
return 0;
diff --git a/net/rxrpc/net_ns.c b/net/rxrpc/net_ns.c
index c7a023fb22d0..48fb8754c387 100644
--- a/net/rxrpc/net_ns.c
+++ b/net/rxrpc/net_ns.c
@@ -85,12 +85,12 @@ static __net_init int rxrpc_init_net(struct net *net)
hash_init(rxnet->peer_hash);
spin_lock_init(&rxnet->peer_hash_lock);
for (i = 0; i < ARRAY_SIZE(rxnet->peer_keepalive); i++)
- INIT_HLIST_HEAD(&rxnet->peer_keepalive[i]);
- INIT_HLIST_HEAD(&rxnet->peer_keepalive_new);
+ INIT_LIST_HEAD(&rxnet->peer_keepalive[i]);
+ INIT_LIST_HEAD(&rxnet->peer_keepalive_new);
timer_setup(&rxnet->peer_keepalive_timer,
rxrpc_peer_keepalive_timeout, 0);
INIT_WORK(&rxnet->peer_keepalive_work, rxrpc_peer_keepalive_worker);
- rxnet->peer_keepalive_base = ktime_add(ktime_get_real(), NSEC_PER_SEC);
+ rxnet->peer_keepalive_base = ktime_get_seconds();
ret = -ENOMEM;
rxnet->proc_net = proc_net_mkdir(net, "rxrpc", net->proc_net);
diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c
index f03de1c59ba3..4774c8f5634d 100644
--- a/net/rxrpc/output.c
+++ b/net/rxrpc/output.c
@@ -209,7 +209,7 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping,
now = ktime_get_real();
if (ping)
call->ping_time = now;
- conn->params.peer->last_tx_at = ktime_get_real();
+ conn->params.peer->last_tx_at = ktime_get_seconds();
if (ret < 0)
trace_rxrpc_tx_fail(call->debug_id, serial, ret,
rxrpc_tx_fail_call_ack);
@@ -296,7 +296,7 @@ int rxrpc_send_abort_packet(struct rxrpc_call *call)
ret = kernel_sendmsg(conn->params.local->socket,
&msg, iov, 1, sizeof(pkt));
- conn->params.peer->last_tx_at = ktime_get_real();
+ conn->params.peer->last_tx_at = ktime_get_seconds();
if (ret < 0)
trace_rxrpc_tx_fail(call->debug_id, serial, ret,
rxrpc_tx_fail_call_abort);
@@ -391,7 +391,7 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb,
* message and update the peer record
*/
ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len);
- conn->params.peer->last_tx_at = ktime_get_real();
+ conn->params.peer->last_tx_at = ktime_get_seconds();
up_read(&conn->params.local->defrag_sem);
if (ret < 0)
@@ -457,7 +457,7 @@ send_fragmentable:
if (ret == 0) {
ret = kernel_sendmsg(conn->params.local->socket, &msg,
iov, 2, len);
- conn->params.peer->last_tx_at = ktime_get_real();
+ conn->params.peer->last_tx_at = ktime_get_seconds();
opt = IP_PMTUDISC_DO;
kernel_setsockopt(conn->params.local->socket, SOL_IP,
@@ -475,7 +475,7 @@ send_fragmentable:
if (ret == 0) {
ret = kernel_sendmsg(conn->params.local->socket, &msg,
iov, 2, len);
- conn->params.peer->last_tx_at = ktime_get_real();
+ conn->params.peer->last_tx_at = ktime_get_seconds();
opt = IPV6_PMTUDISC_DO;
kernel_setsockopt(conn->params.local->socket,
@@ -599,6 +599,6 @@ void rxrpc_send_keepalive(struct rxrpc_peer *peer)
trace_rxrpc_tx_fail(peer->debug_id, 0, ret,
rxrpc_tx_fail_version_keepalive);
- peer->last_tx_at = ktime_get_real();
+ peer->last_tx_at = ktime_get_seconds();
_leave("");
}
diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c
index 0ed8b651cec2..4f9da2f51c69 100644
--- a/net/rxrpc/peer_event.c
+++ b/net/rxrpc/peer_event.c
@@ -350,97 +350,117 @@ void rxrpc_peer_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why,
}
/*
- * Perform keep-alive pings with VERSION packets to keep any NAT alive.
+ * Perform keep-alive pings.
*/
-void rxrpc_peer_keepalive_worker(struct work_struct *work)
+static void rxrpc_peer_keepalive_dispatch(struct rxrpc_net *rxnet,
+ struct list_head *collector,
+ time64_t base,
+ u8 cursor)
{
- struct rxrpc_net *rxnet =
- container_of(work, struct rxrpc_net, peer_keepalive_work);
struct rxrpc_peer *peer;
- unsigned long delay;
- ktime_t base, now = ktime_get_real();
- s64 diff;
- u8 cursor, slot;
+ const u8 mask = ARRAY_SIZE(rxnet->peer_keepalive) - 1;
+ time64_t keepalive_at;
+ int slot;
- base = rxnet->peer_keepalive_base;
- cursor = rxnet->peer_keepalive_cursor;
+ spin_lock_bh(&rxnet->peer_hash_lock);
- _enter("%u,%lld", cursor, ktime_sub(now, base));
+ while (!list_empty(collector)) {
+ peer = list_entry(collector->next,
+ struct rxrpc_peer, keepalive_link);
-next_bucket:
- diff = ktime_to_ns(ktime_sub(now, base));
- if (diff < 0)
- goto resched;
+ list_del_init(&peer->keepalive_link);
+ if (!rxrpc_get_peer_maybe(peer))
+ continue;
- _debug("at %u", cursor);
- spin_lock_bh(&rxnet->peer_hash_lock);
-next_peer:
- if (!rxnet->live) {
spin_unlock_bh(&rxnet->peer_hash_lock);
- goto out;
- }
- /* Everything in the bucket at the cursor is processed this second; the
- * bucket at cursor + 1 goes now + 1s and so on...
- */
- if (hlist_empty(&rxnet->peer_keepalive[cursor])) {
- if (hlist_empty(&rxnet->peer_keepalive_new)) {
- spin_unlock_bh(&rxnet->peer_hash_lock);
- goto emptied_bucket;
+ keepalive_at = peer->last_tx_at + RXRPC_KEEPALIVE_TIME;
+ slot = keepalive_at - base;
+ _debug("%02x peer %u t=%d {%pISp}",
+ cursor, peer->debug_id, slot, &peer->srx.transport);
+
+ if (keepalive_at <= base ||
+ keepalive_at > base + RXRPC_KEEPALIVE_TIME) {
+ rxrpc_send_keepalive(peer);
+ slot = RXRPC_KEEPALIVE_TIME;
}
- hlist_move_list(&rxnet->peer_keepalive_new,
- &rxnet->peer_keepalive[cursor]);
+ /* A transmission to this peer occurred since last we examined
+ * it so put it into the appropriate future bucket.
+ */
+ slot += cursor;
+ slot &= mask;
+ spin_lock_bh(&rxnet->peer_hash_lock);
+ list_add_tail(&peer->keepalive_link,
+ &rxnet->peer_keepalive[slot & mask]);
+ rxrpc_put_peer(peer);
}
- peer = hlist_entry(rxnet->peer_keepalive[cursor].first,
- struct rxrpc_peer, keepalive_link);
- hlist_del_init(&peer->keepalive_link);
- if (!rxrpc_get_peer_maybe(peer))
- goto next_peer;
-
spin_unlock_bh(&rxnet->peer_hash_lock);
+}
- _debug("peer %u {%pISp}", peer->debug_id, &peer->srx.transport);
+/*
+ * Perform keep-alive pings with VERSION packets to keep any NAT alive.
+ */
+void rxrpc_peer_keepalive_worker(struct work_struct *work)
+{
+ struct rxrpc_net *rxnet =
+ container_of(work, struct rxrpc_net, peer_keepalive_work);
+ const u8 mask = ARRAY_SIZE(rxnet->peer_keepalive) - 1;
+ time64_t base, now, delay;
+ u8 cursor, stop;
+ LIST_HEAD(collector);
-recalc:
- diff = ktime_divns(ktime_sub(peer->last_tx_at, base), NSEC_PER_SEC);
- if (diff < -30 || diff > 30)
- goto send; /* LSW of 64-bit time probably wrapped on 32-bit */
- diff += RXRPC_KEEPALIVE_TIME - 1;
- if (diff < 0)
- goto send;
+ now = ktime_get_seconds();
+ base = rxnet->peer_keepalive_base;
+ cursor = rxnet->peer_keepalive_cursor;
+ _enter("%lld,%u", base - now, cursor);
- slot = (diff > RXRPC_KEEPALIVE_TIME - 1) ? RXRPC_KEEPALIVE_TIME - 1 : diff;
- if (slot == 0)
- goto send;
+ if (!rxnet->live)
+ return;
- /* A transmission to this peer occurred since last we examined it so
- * put it into the appropriate future bucket.
+ /* Remove to a temporary list all the peers that are currently lodged
+ * in expired buckets plus all new peers.
+ *
+ * Everything in the bucket at the cursor is processed this
+ * second; the bucket at cursor + 1 goes at now + 1s and so
+ * on...
*/
- slot = (slot + cursor) % ARRAY_SIZE(rxnet->peer_keepalive);
spin_lock_bh(&rxnet->peer_hash_lock);
- hlist_add_head(&peer->keepalive_link, &rxnet->peer_keepalive[slot]);
- rxrpc_put_peer(peer);
- goto next_peer;
-
-send:
- rxrpc_send_keepalive(peer);
- now = ktime_get_real();
- goto recalc;
+ list_splice_init(&rxnet->peer_keepalive_new, &collector);
+
+ stop = cursor + ARRAY_SIZE(rxnet->peer_keepalive);
+ while (base <= now && (s8)(cursor - stop) < 0) {
+ list_splice_tail_init(&rxnet->peer_keepalive[cursor & mask],
+ &collector);
+ base++;
+ cursor++;
+ }
-emptied_bucket:
- cursor++;
- if (cursor >= ARRAY_SIZE(rxnet->peer_keepalive))
- cursor = 0;
- base = ktime_add_ns(base, NSEC_PER_SEC);
- goto next_bucket;
+ base = now;
+ spin_unlock_bh(&rxnet->peer_hash_lock);
-resched:
rxnet->peer_keepalive_base = base;
rxnet->peer_keepalive_cursor = cursor;
- delay = nsecs_to_jiffies(-diff) + 1;
- timer_reduce(&rxnet->peer_keepalive_timer, jiffies + delay);
-out:
+ rxrpc_peer_keepalive_dispatch(rxnet, &collector, base, cursor);
+ ASSERT(list_empty(&collector));
+
+ /* Schedule the timer for the next occupied timeslot. */
+ cursor = rxnet->peer_keepalive_cursor;
+ stop = cursor + RXRPC_KEEPALIVE_TIME - 1;
+ for (; (s8)(cursor - stop) < 0; cursor++) {
+ if (!list_empty(&rxnet->peer_keepalive[cursor & mask]))
+ break;
+ base++;
+ }
+
+ now = ktime_get_seconds();
+ delay = base - now;
+ if (delay < 1)
+ delay = 1;
+ delay *= HZ;
+ if (rxnet->live)
+ timer_reduce(&rxnet->peer_keepalive_timer, jiffies + delay);
+
_leave("");
}
diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c
index 1b7e8107b3ae..24ec7cdcf332 100644
--- a/net/rxrpc/peer_object.c
+++ b/net/rxrpc/peer_object.c
@@ -322,7 +322,7 @@ struct rxrpc_peer *rxrpc_lookup_incoming_peer(struct rxrpc_local *local,
if (!peer) {
peer = prealloc;
hash_add_rcu(rxnet->peer_hash, &peer->hash_link, hash_key);
- hlist_add_head(&peer->keepalive_link, &rxnet->peer_keepalive_new);
+ list_add_tail(&peer->keepalive_link, &rxnet->peer_keepalive_new);
}
spin_unlock(&rxnet->peer_hash_lock);
@@ -367,8 +367,8 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *local,
if (!peer) {
hash_add_rcu(rxnet->peer_hash,
&candidate->hash_link, hash_key);
- hlist_add_head(&candidate->keepalive_link,
- &rxnet->peer_keepalive_new);
+ list_add_tail(&candidate->keepalive_link,
+ &rxnet->peer_keepalive_new);
}
spin_unlock_bh(&rxnet->peer_hash_lock);
@@ -441,7 +441,7 @@ static void __rxrpc_put_peer(struct rxrpc_peer *peer)
spin_lock_bh(&rxnet->peer_hash_lock);
hash_del_rcu(&peer->hash_link);
- hlist_del_init(&peer->keepalive_link);
+ list_del_init(&peer->keepalive_link);
spin_unlock_bh(&rxnet->peer_hash_lock);
kfree_rcu(peer, rcu);
diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c
index 6c0ae27fff84..94262c3ead88 100644
--- a/net/rxrpc/rxkad.c
+++ b/net/rxrpc/rxkad.c
@@ -669,7 +669,7 @@ static int rxkad_issue_challenge(struct rxrpc_connection *conn)
return -EAGAIN;
}
- conn->params.peer->last_tx_at = ktime_get_real();
+ conn->params.peer->last_tx_at = ktime_get_seconds();
_leave(" = 0");
return 0;
}
@@ -725,7 +725,7 @@ static int rxkad_send_response(struct rxrpc_connection *conn,
return -EAGAIN;
}
- conn->params.peer->last_tx_at = ktime_get_real();
+ conn->params.peer->last_tx_at = ktime_get_seconds();
_leave(" = 0");
return 0;
}
diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c
index 2ba721a590a7..a74b4d6ee186 100644
--- a/net/sched/cls_matchall.c
+++ b/net/sched/cls_matchall.c
@@ -122,6 +122,8 @@ static void mall_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
if (!head)
return;
+ tcf_unbind_filter(tp, &head->res);
+
if (!tc_skip_hw(head->flags))
mall_destroy_hw_filter(tp, head, (unsigned long) head, extack);
diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c
index b49cc990a000..9cb37c63c3e5 100644
--- a/net/sched/cls_tcindex.c
+++ b/net/sched/cls_tcindex.c
@@ -468,11 +468,6 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
tcf_bind_filter(tp, &cr.res, base);
}
- if (old_r)
- tcf_exts_change(&r->exts, &e);
- else
- tcf_exts_change(&cr.exts, &e);
-
if (old_r && old_r != r) {
err = tcindex_filter_result_init(old_r);
if (err < 0) {
@@ -483,12 +478,15 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
oldp = p;
r->res = cr.res;
+ tcf_exts_change(&r->exts, &e);
+
rcu_assign_pointer(tp->root, cp);
if (r == &new_filter_result) {
struct tcindex_filter *nfp;
struct tcindex_filter __rcu **fp;
+ f->result.res = r->res;
tcf_exts_change(&f->result.exts, &r->exts);
fp = cp->h + (handle % cp->hash);
diff --git a/net/socket.c b/net/socket.c
index 6a6aa84b64c1..0316b380389e 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2694,8 +2694,7 @@ EXPORT_SYMBOL(sock_unregister);
bool sock_is_registered(int family)
{
- return family < NPROTO &&
- rcu_access_pointer(net_families[array_index_nospec(family, NPROTO)]);
+ return family < NPROTO && rcu_access_pointer(net_families[family]);
}
static int __init sock_init(void)
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index c1076c19b858..ab27a2872935 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -451,14 +451,14 @@ static int vsock_send_shutdown(struct sock *sk, int mode)
return transport->shutdown(vsock_sk(sk), mode);
}
-void vsock_pending_work(struct work_struct *work)
+static void vsock_pending_work(struct work_struct *work)
{
struct sock *sk;
struct sock *listener;
struct vsock_sock *vsk;
bool cleanup;
- vsk = container_of(work, struct vsock_sock, dwork.work);
+ vsk = container_of(work, struct vsock_sock, pending_work.work);
sk = sk_vsock(vsk);
listener = vsk->listener;
cleanup = true;
@@ -498,7 +498,6 @@ out:
sock_put(sk);
sock_put(listener);
}
-EXPORT_SYMBOL_GPL(vsock_pending_work);
/**** SOCKET OPERATIONS ****/
@@ -597,6 +596,8 @@ static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr)
return retval;
}
+static void vsock_connect_timeout(struct work_struct *work);
+
struct sock *__vsock_create(struct net *net,
struct socket *sock,
struct sock *parent,
@@ -638,6 +639,8 @@ struct sock *__vsock_create(struct net *net,
vsk->sent_request = false;
vsk->ignore_connecting_rst = false;
vsk->peer_shutdown = 0;
+ INIT_DELAYED_WORK(&vsk->connect_work, vsock_connect_timeout);
+ INIT_DELAYED_WORK(&vsk->pending_work, vsock_pending_work);
psk = parent ? vsock_sk(parent) : NULL;
if (parent) {
@@ -1117,7 +1120,7 @@ static void vsock_connect_timeout(struct work_struct *work)
struct vsock_sock *vsk;
int cancel = 0;
- vsk = container_of(work, struct vsock_sock, dwork.work);
+ vsk = container_of(work, struct vsock_sock, connect_work.work);
sk = sk_vsock(vsk);
lock_sock(sk);
@@ -1221,9 +1224,7 @@ static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr,
* timeout fires.
*/
sock_hold(sk);
- INIT_DELAYED_WORK(&vsk->dwork,
- vsock_connect_timeout);
- schedule_delayed_work(&vsk->dwork, timeout);
+ schedule_delayed_work(&vsk->connect_work, timeout);
/* Skip ahead to preserve error code set above. */
goto out_wait;
diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c
index a7a73ffe675b..cb332adb84cd 100644
--- a/net/vmw_vsock/vmci_transport.c
+++ b/net/vmw_vsock/vmci_transport.c
@@ -1094,8 +1094,7 @@ static int vmci_transport_recv_listen(struct sock *sk,
vpending->listener = sk;
sock_hold(sk);
sock_hold(pending);
- INIT_DELAYED_WORK(&vpending->dwork, vsock_pending_work);
- schedule_delayed_work(&vpending->dwork, HZ);
+ schedule_delayed_work(&vpending->pending_work, HZ);
out:
return err;
diff --git a/scripts/depmod.sh b/scripts/depmod.sh
index 9831cca31240..f41b0a4b575c 100755
--- a/scripts/depmod.sh
+++ b/scripts/depmod.sh
@@ -11,10 +11,16 @@ DEPMOD=$1
KERNELRELEASE=$2
SYMBOL_PREFIX=$3
-if ! test -r System.map -a -x "$DEPMOD"; then
+if ! test -r System.map ; then
exit 0
fi
+if [ -z $(command -v $DEPMOD) ]; then
+ echo "'make modules_install' requires $DEPMOD. Please install it." >&2
+ echo "This is probably in the kmod package." >&2
+ exit 1
+fi
+
# older versions of depmod don't support -P <symbol-prefix>
# support was added in module-init-tools 3.13
if test -n "$SYMBOL_PREFIX"; then
diff --git a/sound/core/memalloc.c b/sound/core/memalloc.c
index 7f89d3c79a4b..753d5fc4b284 100644
--- a/sound/core/memalloc.c
+++ b/sound/core/memalloc.c
@@ -242,16 +242,12 @@ int snd_dma_alloc_pages_fallback(int type, struct device *device, size_t size,
int err;
while ((err = snd_dma_alloc_pages(type, device, size, dmab)) < 0) {
- size_t aligned_size;
if (err != -ENOMEM)
return err;
if (size <= PAGE_SIZE)
return -ENOMEM;
- aligned_size = PAGE_SIZE << get_order(size);
- if (size != aligned_size)
- size = aligned_size;
- else
- size >>= 1;
+ size >>= 1;
+ size = PAGE_SIZE << get_order(size);
}
if (! dmab->area)
return -ENOMEM;
diff --git a/sound/core/seq/oss/seq_oss.c b/sound/core/seq/oss/seq_oss.c
index 5f64d0d88320..e1f44fc86885 100644
--- a/sound/core/seq/oss/seq_oss.c
+++ b/sound/core/seq/oss/seq_oss.c
@@ -203,7 +203,7 @@ odev_poll(struct file *file, poll_table * wait)
struct seq_oss_devinfo *dp;
dp = file->private_data;
if (snd_BUG_ON(!dp))
- return -ENXIO;
+ return EPOLLERR;
return snd_seq_oss_poll(dp, file, wait);
}
diff --git a/sound/core/seq/seq_clientmgr.c b/sound/core/seq/seq_clientmgr.c
index 61a07fe34cd2..ee8d0d86f0df 100644
--- a/sound/core/seq/seq_clientmgr.c
+++ b/sound/core/seq/seq_clientmgr.c
@@ -1101,7 +1101,7 @@ static __poll_t snd_seq_poll(struct file *file, poll_table * wait)
/* check client structures are in place */
if (snd_BUG_ON(!client))
- return -ENXIO;
+ return EPOLLERR;
if ((snd_seq_file_flags(file) & SNDRV_SEQ_LFLG_INPUT) &&
client->data.user.fifo) {
diff --git a/sound/core/seq/seq_virmidi.c b/sound/core/seq/seq_virmidi.c
index 289ae6bb81d9..8ebbca554e99 100644
--- a/sound/core/seq/seq_virmidi.c
+++ b/sound/core/seq/seq_virmidi.c
@@ -163,6 +163,7 @@ static void snd_virmidi_output_trigger(struct snd_rawmidi_substream *substream,
int count, res;
unsigned char buf[32], *pbuf;
unsigned long flags;
+ bool check_resched = !in_atomic();
if (up) {
vmidi->trigger = 1;
@@ -200,6 +201,15 @@ static void snd_virmidi_output_trigger(struct snd_rawmidi_substream *substream,
vmidi->event.type = SNDRV_SEQ_EVENT_NONE;
}
}
+ if (!check_resched)
+ continue;
+ /* do temporary unlock & cond_resched() for avoiding
+ * CPU soft lockup, which may happen via a write from
+ * a huge rawmidi buffer
+ */
+ spin_unlock_irqrestore(&substream->runtime->lock, flags);
+ cond_resched();
+ spin_lock_irqsave(&substream->runtime->lock, flags);
}
out:
spin_unlock_irqrestore(&substream->runtime->lock, flags);
diff --git a/sound/pci/cs5535audio/cs5535audio.h b/sound/pci/cs5535audio/cs5535audio.h
index f4fcdf93f3c8..d84620a0c26c 100644
--- a/sound/pci/cs5535audio/cs5535audio.h
+++ b/sound/pci/cs5535audio/cs5535audio.h
@@ -67,9 +67,9 @@ struct cs5535audio_dma_ops {
};
struct cs5535audio_dma_desc {
- u32 addr;
- u16 size;
- u16 ctlreserved;
+ __le32 addr;
+ __le16 size;
+ __le16 ctlreserved;
};
struct cs5535audio_dma {
diff --git a/sound/pci/cs5535audio/cs5535audio_pcm.c b/sound/pci/cs5535audio/cs5535audio_pcm.c
index ee7065f6e162..326caec854e1 100644
--- a/sound/pci/cs5535audio/cs5535audio_pcm.c
+++ b/sound/pci/cs5535audio/cs5535audio_pcm.c
@@ -158,8 +158,8 @@ static int cs5535audio_build_dma_packets(struct cs5535audio *cs5535au,
lastdesc->addr = cpu_to_le32((u32) dma->desc_buf.addr);
lastdesc->size = 0;
lastdesc->ctlreserved = cpu_to_le16(PRD_JMP);
- jmpprd_addr = cpu_to_le32(lastdesc->addr +
- (sizeof(struct cs5535audio_dma_desc)*periods));
+ jmpprd_addr = (u32)dma->desc_buf.addr +
+ sizeof(struct cs5535audio_dma_desc) * periods;
dma->substream = substream;
dma->period_bytes = period_bytes;
diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c
index a0c93b9c9a28..c8e6d0d08c8f 100644
--- a/sound/pci/hda/hda_intel.c
+++ b/sound/pci/hda/hda_intel.c
@@ -2207,7 +2207,7 @@ out_free:
*/
static struct snd_pci_quirk power_save_blacklist[] = {
/* https://bugzilla.redhat.com/show_bug.cgi?id=1525104 */
- SND_PCI_QUIRK(0x1849, 0x0c0c, "Asrock B85M-ITX", 0),
+ SND_PCI_QUIRK(0x1849, 0xc892, "Asrock B85M-ITX", 0),
/* https://bugzilla.redhat.com/show_bug.cgi?id=1525104 */
SND_PCI_QUIRK(0x1043, 0x8733, "Asus Prime X370-Pro", 0),
/* https://bugzilla.redhat.com/show_bug.cgi?id=1572975 */
diff --git a/sound/pci/hda/patch_conexant.c b/sound/pci/hda/patch_conexant.c
index 88ce2f1022e1..16197ad4512a 100644
--- a/sound/pci/hda/patch_conexant.c
+++ b/sound/pci/hda/patch_conexant.c
@@ -211,6 +211,7 @@ static void cx_auto_reboot_notify(struct hda_codec *codec)
struct conexant_spec *spec = codec->spec;
switch (codec->core.vendor_id) {
+ case 0x14f12008: /* CX8200 */
case 0x14f150f2: /* CX20722 */
case 0x14f150f4: /* CX20724 */
break;
@@ -218,13 +219,14 @@ static void cx_auto_reboot_notify(struct hda_codec *codec)
return;
}
- /* Turn the CX20722 codec into D3 to avoid spurious noises
+ /* Turn the problematic codec into D3 to avoid spurious noises
from the internal speaker during (and after) reboot */
cx_auto_turn_eapd(codec, spec->num_eapds, spec->eapds, false);
snd_hda_codec_set_power_to_all(codec, codec->core.afg, AC_PWRST_D3);
snd_hda_codec_write(codec, codec->core.afg, 0,
AC_VERB_SET_POWER_STATE, AC_PWRST_D3);
+ msleep(10);
}
static void cx_auto_free(struct hda_codec *codec)
diff --git a/sound/pci/vx222/vx222_ops.c b/sound/pci/vx222/vx222_ops.c
index d4298af6d3ee..c0d0bf44f365 100644
--- a/sound/pci/vx222/vx222_ops.c
+++ b/sound/pci/vx222/vx222_ops.c
@@ -275,7 +275,7 @@ static void vx2_dma_write(struct vx_core *chip, struct snd_pcm_runtime *runtime,
length >>= 2; /* in 32bit words */
/* Transfer using pseudo-dma. */
for (; length > 0; length--) {
- outl(cpu_to_le32(*addr), port);
+ outl(*addr, port);
addr++;
}
addr = (u32 *)runtime->dma_area;
@@ -285,7 +285,7 @@ static void vx2_dma_write(struct vx_core *chip, struct snd_pcm_runtime *runtime,
count >>= 2; /* in 32bit words */
/* Transfer using pseudo-dma. */
for (; count > 0; count--) {
- outl(cpu_to_le32(*addr), port);
+ outl(*addr, port);
addr++;
}
@@ -313,7 +313,7 @@ static void vx2_dma_read(struct vx_core *chip, struct snd_pcm_runtime *runtime,
length >>= 2; /* in 32bit words */
/* Transfer using pseudo-dma. */
for (; length > 0; length--)
- *addr++ = le32_to_cpu(inl(port));
+ *addr++ = inl(port);
addr = (u32 *)runtime->dma_area;
pipe->hw_ptr = 0;
}
@@ -321,7 +321,7 @@ static void vx2_dma_read(struct vx_core *chip, struct snd_pcm_runtime *runtime,
count >>= 2; /* in 32bit words */
/* Transfer using pseudo-dma. */
for (; count > 0; count--)
- *addr++ = le32_to_cpu(inl(port));
+ *addr++ = inl(port);
vx2_release_pseudo_dma(chip);
}
diff --git a/sound/pcmcia/vx/vxp_ops.c b/sound/pcmcia/vx/vxp_ops.c
index 8cde40226355..4c4ef1fec69f 100644
--- a/sound/pcmcia/vx/vxp_ops.c
+++ b/sound/pcmcia/vx/vxp_ops.c
@@ -375,7 +375,7 @@ static void vxp_dma_write(struct vx_core *chip, struct snd_pcm_runtime *runtime,
length >>= 1; /* in 16bit words */
/* Transfer using pseudo-dma. */
for (; length > 0; length--) {
- outw(cpu_to_le16(*addr), port);
+ outw(*addr, port);
addr++;
}
addr = (unsigned short *)runtime->dma_area;
@@ -385,7 +385,7 @@ static void vxp_dma_write(struct vx_core *chip, struct snd_pcm_runtime *runtime,
count >>= 1; /* in 16bit words */
/* Transfer using pseudo-dma. */
for (; count > 0; count--) {
- outw(cpu_to_le16(*addr), port);
+ outw(*addr, port);
addr++;
}
vx_release_pseudo_dma(chip);
@@ -417,7 +417,7 @@ static void vxp_dma_read(struct vx_core *chip, struct snd_pcm_runtime *runtime,
length >>= 1; /* in 16bit words */
/* Transfer using pseudo-dma. */
for (; length > 0; length--)
- *addr++ = le16_to_cpu(inw(port));
+ *addr++ = inw(port);
addr = (unsigned short *)runtime->dma_area;
pipe->hw_ptr = 0;
}
@@ -425,12 +425,12 @@ static void vxp_dma_read(struct vx_core *chip, struct snd_pcm_runtime *runtime,
count >>= 1; /* in 16bit words */
/* Transfer using pseudo-dma. */
for (; count > 1; count--)
- *addr++ = le16_to_cpu(inw(port));
+ *addr++ = inw(port);
/* Disable DMA */
pchip->regDIALOG &= ~VXP_DLG_DMAREAD_SEL_MASK;
vx_outb(chip, DIALOG, pchip->regDIALOG);
/* Read the last word (16 bits) */
- *addr = le16_to_cpu(inw(port));
+ *addr = inw(port);
/* Disable 16-bit accesses */
pchip->regDIALOG &= ~VXP_DLG_DMA16_SEL_MASK;
vx_outb(chip, DIALOG, pchip->regDIALOG);
diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h
index 578793e97431..f8659f070fc6 100644
--- a/tools/arch/x86/include/asm/cpufeatures.h
+++ b/tools/arch/x86/include/asm/cpufeatures.h
@@ -198,7 +198,6 @@
#define X86_FEATURE_CAT_L2 ( 7*32+ 5) /* Cache Allocation Technology L2 */
#define X86_FEATURE_CDP_L3 ( 7*32+ 6) /* Code and Data Prioritization L3 */
#define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 7) /* Effectively INVPCID && CR4.PCIDE=1 */
-
#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */
#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
#define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */
@@ -207,13 +206,20 @@
#define X86_FEATURE_RETPOLINE_AMD ( 7*32+13) /* "" AMD Retpoline mitigation for Spectre variant 2 */
#define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */
#define X86_FEATURE_CDP_L2 ( 7*32+15) /* Code and Data Prioritization L2 */
-
+#define X86_FEATURE_MSR_SPEC_CTRL ( 7*32+16) /* "" MSR SPEC_CTRL is implemented */
+#define X86_FEATURE_SSBD ( 7*32+17) /* Speculative Store Bypass Disable */
#define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */
#define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* "" Fill RSB on context switches */
#define X86_FEATURE_SEV ( 7*32+20) /* AMD Secure Encrypted Virtualization */
-
#define X86_FEATURE_USE_IBPB ( 7*32+21) /* "" Indirect Branch Prediction Barrier enabled */
#define X86_FEATURE_USE_IBRS_FW ( 7*32+22) /* "" Use IBRS during runtime firmware calls */
+#define X86_FEATURE_SPEC_STORE_BYPASS_DISABLE ( 7*32+23) /* "" Disable Speculative Store Bypass. */
+#define X86_FEATURE_LS_CFG_SSBD ( 7*32+24) /* "" AMD SSBD implementation via LS_CFG MSR */
+#define X86_FEATURE_IBRS ( 7*32+25) /* Indirect Branch Restricted Speculation */
+#define X86_FEATURE_IBPB ( 7*32+26) /* Indirect Branch Prediction Barrier */
+#define X86_FEATURE_STIBP ( 7*32+27) /* Single Thread Indirect Branch Predictors */
+#define X86_FEATURE_ZEN ( 7*32+28) /* "" CPU is AMD family 0x17 (Zen) */
+#define X86_FEATURE_L1TF_PTEINV ( 7*32+29) /* "" L1TF workaround PTE inversion */
/* Virtualization flags: Linux defined, word 8 */
#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */
@@ -274,9 +280,10 @@
#define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */
#define X86_FEATURE_IRPERF (13*32+ 1) /* Instructions Retired Count */
#define X86_FEATURE_XSAVEERPTR (13*32+ 2) /* Always save/restore FP error pointers */
-#define X86_FEATURE_IBPB (13*32+12) /* Indirect Branch Prediction Barrier */
-#define X86_FEATURE_IBRS (13*32+14) /* Indirect Branch Restricted Speculation */
-#define X86_FEATURE_STIBP (13*32+15) /* Single Thread Indirect Branch Predictors */
+#define X86_FEATURE_AMD_IBPB (13*32+12) /* "" Indirect Branch Prediction Barrier */
+#define X86_FEATURE_AMD_IBRS (13*32+14) /* "" Indirect Branch Restricted Speculation */
+#define X86_FEATURE_AMD_STIBP (13*32+15) /* "" Single Thread Indirect Branch Predictors */
+#define X86_FEATURE_VIRT_SSBD (13*32+25) /* Virtualized Speculative Store Bypass Disable */
/* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */
#define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */
@@ -333,7 +340,9 @@
#define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */
#define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */
#define X86_FEATURE_INTEL_STIBP (18*32+27) /* "" Single Thread Indirect Branch Predictors */
+#define X86_FEATURE_FLUSH_L1D (18*32+28) /* Flush L1D cache */
#define X86_FEATURE_ARCH_CAPABILITIES (18*32+29) /* IA32_ARCH_CAPABILITIES MSR (Intel) */
+#define X86_FEATURE_SPEC_CTRL_SSBD (18*32+31) /* "" Speculative Store Bypass Disable */
/*
* BUG word(s)
@@ -363,5 +372,7 @@
#define X86_BUG_CPU_MELTDOWN X86_BUG(14) /* CPU is affected by meltdown attack and needs kernel page table isolation */
#define X86_BUG_SPECTRE_V1 X86_BUG(15) /* CPU is affected by Spectre variant 1 attack with conditional branches */
#define X86_BUG_SPECTRE_V2 X86_BUG(16) /* CPU is affected by Spectre variant 2 attack with indirect branches */
+#define X86_BUG_SPEC_STORE_BYPASS X86_BUG(17) /* CPU is affected by speculative store bypass attack */
+#define X86_BUG_L1TF X86_BUG(18) /* CPU is affected by L1 Terminal Fault */
#endif /* _ASM_X86_CPUFEATURES_H */
diff --git a/tools/include/uapi/linux/prctl.h b/tools/include/uapi/linux/prctl.h
index af5f8c2df87a..db9f15f5db04 100644
--- a/tools/include/uapi/linux/prctl.h
+++ b/tools/include/uapi/linux/prctl.h
@@ -207,4 +207,16 @@ struct prctl_mm_map {
# define PR_SVE_VL_LEN_MASK 0xffff
# define PR_SVE_VL_INHERIT (1 << 17) /* inherit across exec */
+/* Per task speculation control */
+#define PR_GET_SPECULATION_CTRL 52
+#define PR_SET_SPECULATION_CTRL 53
+/* Speculation control variants */
+# define PR_SPEC_STORE_BYPASS 0
+/* Return and control values for PR_SET/GET_SPECULATION_CTRL */
+# define PR_SPEC_NOT_AFFECTED 0
+# define PR_SPEC_PRCTL (1UL << 0)
+# define PR_SPEC_ENABLE (1UL << 1)
+# define PR_SPEC_DISABLE (1UL << 2)
+# define PR_SPEC_FORCE_DISABLE (1UL << 3)
+
#endif /* _LINUX_PRCTL_H */