From 10c02aad111df02088d1a81792a709f6a7eca6cc Mon Sep 17 00:00:00 2001 From: Sebastian Ene Date: Wed, 24 Jan 2024 09:10:28 +0000 Subject: KVM: arm64: Fix circular locking dependency The rule inside kvm enforces that the vcpu->mutex is taken *inside* kvm->lock. The rule is violated by the pkvm_create_hyp_vm() which acquires the kvm->lock while already holding the vcpu->mutex lock from kvm_vcpu_ioctl(). Avoid the circular locking dependency altogether by protecting the hyp vm handle with the config_lock, much like we already do for other forms of VM-scoped data. Signed-off-by: Sebastian Ene Cc: stable@vger.kernel.org Reviewed-by: Oliver Upton Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20240124091027.1477174-2-sebastianene@google.com --- arch/arm64/kvm/pkvm.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c index 8350fb8fee0b..b7be96a53597 100644 --- a/arch/arm64/kvm/pkvm.c +++ b/arch/arm64/kvm/pkvm.c @@ -101,6 +101,17 @@ void __init kvm_hyp_reserve(void) hyp_mem_base); } +static void __pkvm_destroy_hyp_vm(struct kvm *host_kvm) +{ + if (host_kvm->arch.pkvm.handle) { + WARN_ON(kvm_call_hyp_nvhe(__pkvm_teardown_vm, + host_kvm->arch.pkvm.handle)); + } + + host_kvm->arch.pkvm.handle = 0; + free_hyp_memcache(&host_kvm->arch.pkvm.teardown_mc); +} + /* * Allocates and donates memory for hypervisor VM structs at EL2. * @@ -181,7 +192,7 @@ static int __pkvm_create_hyp_vm(struct kvm *host_kvm) return 0; destroy_vm: - pkvm_destroy_hyp_vm(host_kvm); + __pkvm_destroy_hyp_vm(host_kvm); return ret; free_vm: free_pages_exact(hyp_vm, hyp_vm_sz); @@ -194,23 +205,19 @@ int pkvm_create_hyp_vm(struct kvm *host_kvm) { int ret = 0; - mutex_lock(&host_kvm->lock); + mutex_lock(&host_kvm->arch.config_lock); if (!host_kvm->arch.pkvm.handle) ret = __pkvm_create_hyp_vm(host_kvm); - mutex_unlock(&host_kvm->lock); + mutex_unlock(&host_kvm->arch.config_lock); return ret; } void pkvm_destroy_hyp_vm(struct kvm *host_kvm) { - if (host_kvm->arch.pkvm.handle) { - WARN_ON(kvm_call_hyp_nvhe(__pkvm_teardown_vm, - host_kvm->arch.pkvm.handle)); - } - - host_kvm->arch.pkvm.handle = 0; - free_hyp_memcache(&host_kvm->arch.pkvm.teardown_mc); + mutex_lock(&host_kvm->arch.config_lock); + __pkvm_destroy_hyp_vm(host_kvm); + mutex_unlock(&host_kvm->arch.config_lock); } int pkvm_init_host_vm(struct kvm *host_kvm) -- cgit v1.2.3 From 6231c9e1a9f35b535c66709aa8a6eda40dbc4132 Mon Sep 17 00:00:00 2001 From: Prasad Pandit Date: Wed, 3 Jan 2024 13:23:43 +0530 Subject: KVM: x86: make KVM_REQ_NMI request iff NMI pending for vcpu kvm_vcpu_ioctl_x86_set_vcpu_events() routine makes 'KVM_REQ_NMI' request for a vcpu even when its 'events->nmi.pending' is zero. Ex: qemu_thread_start kvm_vcpu_thread_fn qemu_wait_io_event qemu_wait_io_event_common process_queued_cpu_work do_kvm_cpu_synchronize_post_init/_reset kvm_arch_put_registers kvm_put_vcpu_events (cpu, level=[2|3]) This leads vCPU threads in QEMU to constantly acquire & release the global mutex lock, delaying the guest boot due to lock contention. Add check to make KVM_REQ_NMI request only if vcpu has NMI pending. 
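In outline, the guarded request looks like the sketch below (mirroring the hunk that follows, from kvm_vcpu_ioctl_x86_set_vcpu_events()):

	if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) {
		vcpu->arch.nmi_pending = 0;
		atomic_set(&vcpu->arch.nmi_queued, events->nmi.pending);
		/* only kick the vCPU when userspace actually queued an NMI */
		if (events->nmi.pending)
			kvm_make_request(KVM_REQ_NMI, vcpu);
	}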
Fixes: bdedff263132 ("KVM: x86: Route pending NMIs from userspace through process_nmi()")
Cc: stable@vger.kernel.org
Signed-off-by: Prasad Pandit
Link: https://lore.kernel.org/r/20240103075343.549293-1-ppandit@redhat.com
Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/x86.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 363b1c080205..25bc52cdf8f4 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5454,7 +5454,8 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
 	if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) {
 		vcpu->arch.nmi_pending = 0;
 		atomic_set(&vcpu->arch.nmi_queued, events->nmi.pending);
-		kvm_make_request(KVM_REQ_NMI, vcpu);
+		if (events->nmi.pending)
+			kvm_make_request(KVM_REQ_NMI, vcpu);
 	}

 	static_call(kvm_x86_set_nmi_mask)(vcpu, events->nmi.masked);
-- cgit v1.2.3


From 9e62797fd7e8eef9c8a3f7b54b57fbc9caf6a20a Mon Sep 17 00:00:00 2001
From: Vitaly Kuznetsov
Date: Tue, 9 Jan 2024 15:11:21 +0100
Subject: KVM: x86: Make gtod_is_based_on_tsc() return 'bool'

gtod_is_based_on_tsc() is boolean in nature, i.e. it returns '1' for good
clocksources and '0' otherwise. Moreover, its result is used raw by
kvm_get_time_and_clockread()/kvm_get_walltime_and_clockread() which are
'bool'.

No functional change intended.

Signed-off-by: Vitaly Kuznetsov
Link: https://lore.kernel.org/r/20240109141121.1619463-6-vkuznets@redhat.com
Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/x86.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 363b1c080205..aa27aec10860 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2507,7 +2507,7 @@ static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
 }

 #ifdef CONFIG_X86_64
-static inline int gtod_is_based_on_tsc(int mode)
+static inline bool gtod_is_based_on_tsc(int mode)
 {
 	return mode == VDSO_CLOCKMODE_TSC || mode == VDSO_CLOCKMODE_HVCLOCK;
 }
-- cgit v1.2.3


From 05519c86d6997cfb9bb6c82ce1595d1015b718dc Mon Sep 17 00:00:00 2001
From: Mingwei Zhang
Date: Tue, 23 Jan 2024 22:12:20 +0000
Subject: KVM: x86/pmu: Fix type length error when reading pmu->fixed_ctr_ctrl

Use a u64 instead of a u8 when taking a snapshot of pmu->fixed_ctr_ctrl when
reprogramming fixed counters, as truncating the value results in KVM thinking
fixed counter 2 is already disabled (the bug also affects fixed counters 3+,
but KVM doesn't yet support those). As a result, if the guest disables fixed
counter 2, KVM will get a false negative and fail to reprogram/disable
emulation of the counter, which can lead to incorrect counts and spurious
PMIs in the guest.
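The truncation is easy to demonstrate in isolation: in IA32_FIXED_CTR_CTRL,
fixed counter i is controlled by bits [4i+3:4i], so counter 2's control bits
(8-11) don't survive a u8 snapshot. A standalone sketch with made-up values:

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint64_t fixed_ctr_ctrl = 0x300;  /* fixed counter 2 enabled: bits 8-11 */
		uint8_t  old8  = fixed_ctr_ctrl;  /* truncated snapshot: 0x00 */
		uint64_t old64 = fixed_ctr_ctrl;  /* full snapshot: 0x300 */

		/* With the u8 snapshot, counter 2 already looks disabled, so a
		 * guest write clearing it appears to be a no-op. */
		printf("u8: %#x, u64: %#llx\n", (unsigned)old8,
		       (unsigned long long)old64);
		return 0;
	}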
Fixes: 76d287b2342e ("KVM: x86/pmu: Drop "u8 ctrl, int idx" for reprogram_fixed_counter()") Cc: stable@vger.kernel.org Signed-off-by: Mingwei Zhang Link: https://lore.kernel.org/r/20240123221220.3911317-1-mizhang@google.com [sean: rewrite changelog to call out the effects of the bug] Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/pmu_intel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index a6216c874729..315c7c2ba89b 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -71,7 +71,7 @@ static int fixed_pmc_events[] = { static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data) { struct kvm_pmc *pmc; - u8 old_fixed_ctr_ctrl = pmu->fixed_ctr_ctrl; + u64 old_fixed_ctr_ctrl = pmu->fixed_ctr_ctrl; int i; pmu->fixed_ctr_ctrl = data; -- cgit v1.2.3 From 42dfa94d802a48c871e2017cbf86153270c86632 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 4 Feb 2024 16:43:05 +0900 Subject: KVM: arm64: Do not source virt/lib/Kconfig twice For ARCH=arm64, virt/lib/Kconfig is sourced twice, from arch/arm64/kvm/Kconfig and from drivers/vfio/Kconfig. There is no good reason to parse virt/lib/Kconfig twice. Commit 2412405b3141 ("KVM: arm/arm64: register irq bypass consumer on ARM/ARM64") should not have added this 'source' directive. Signed-off-by: Masahiro Yamada Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20240204074305.31492-1-masahiroy@kernel.org --- arch/arm64/kvm/Kconfig | 1 - 1 file changed, 1 deletion(-) (limited to 'arch') diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index 6c3c8ca73e7f..27ca89b628a0 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -3,7 +3,6 @@ # KVM configuration # -source "virt/lib/Kconfig" source "virt/kvm/Kconfig" menuconfig VIRTUALIZATION -- cgit v1.2.3 From 8ded03ae48b3657e0fbca99d8a9d8fa1bfb8c9be Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Fri, 2 Feb 2024 18:26:46 -0600 Subject: powerpc/pseries/papr-sysparm: use u8 arrays for payloads Some PAPR system parameter values are formatted by firmware as nul-terminated strings (e.g. LPAR name, shared processor attributes). But the values returned for other parameters, such as processor module info and TLB block invalidate characteristics, are binary data with parameter-specific layouts. So char[] isn't the appropriate type for the general case. Use u8/__u8. 
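A standalone sketch of why plain char is a poor carrier for binary payload
bytes (its signedness is implementation-defined, so the same byte can print
and compare differently across ABIs):

	#include <stdio.h>

	int main(void)
	{
		char c = (char)0xf0;     /* -16 or 240, depending on the ABI */
		unsigned char u = 0xf0;  /* __u8: always 240 */

		printf("char: %d, u8: %u\n", c, u);
		return 0;
	}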
Signed-off-by: Nathan Lynch Fixes: 905b9e48786e ("powerpc/pseries/papr-sysparm: Expose character device to user space") Signed-off-by: Michael Ellerman Link: https://msgid.link/20240202-papr-sysparm-ioblock-data-use-u8-v1-1-f5c6c89f65ec@linux.ibm.com --- arch/powerpc/include/asm/papr-sysparm.h | 2 +- arch/powerpc/include/uapi/asm/papr-sysparm.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/papr-sysparm.h b/arch/powerpc/include/asm/papr-sysparm.h index 0dbbff59101d..c3cd5b131033 100644 --- a/arch/powerpc/include/asm/papr-sysparm.h +++ b/arch/powerpc/include/asm/papr-sysparm.h @@ -32,7 +32,7 @@ typedef struct { */ struct papr_sysparm_buf { __be16 len; - char val[PAPR_SYSPARM_MAX_OUTPUT]; + u8 val[PAPR_SYSPARM_MAX_OUTPUT]; }; struct papr_sysparm_buf *papr_sysparm_buf_alloc(void); diff --git a/arch/powerpc/include/uapi/asm/papr-sysparm.h b/arch/powerpc/include/uapi/asm/papr-sysparm.h index 9f9a0f267ea5..f733467b1534 100644 --- a/arch/powerpc/include/uapi/asm/papr-sysparm.h +++ b/arch/powerpc/include/uapi/asm/papr-sysparm.h @@ -14,7 +14,7 @@ enum { struct papr_sysparm_io_block { __u32 parameter; __u16 length; - char data[PAPR_SYSPARM_MAX_OUTPUT]; + __u8 data[PAPR_SYSPARM_MAX_OUTPUT]; }; /** -- cgit v1.2.3 From ed8b94f6e0acd652ce69bd69d678a0c769172df8 Mon Sep 17 00:00:00 2001 From: Gaurav Batra Date: Mon, 22 Jan 2024 16:24:07 -0600 Subject: powerpc/pseries/iommu: Fix iommu initialisation during DLPAR add When a PCI device is dynamically added, the kernel oopses with a NULL pointer dereference: BUG: Kernel NULL pointer dereference on read at 0x00000030 Faulting instruction address: 0xc0000000006bbe5c Oops: Kernel access of bad area, sig: 11 [#1] LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=2048 NUMA pSeries Modules linked in: rpadlpar_io rpaphp rpcsec_gss_krb5 auth_rpcgss nfsv4 dns_resolver nfs lockd grace fscache netfs xsk_diag bonding nft_compat nf_tables nfnetlink rfkill binfmt_misc dm_multipath rpcrdma sunrpc rdma_ucm ib_srpt ib_isert iscsi_target_mod target_core_mod ib_umad ib_iser libiscsi scsi_transport_iscsi ib_ipoib rdma_cm iw_cm ib_cm mlx5_ib ib_uverbs ib_core pseries_rng drm drm_panel_orientation_quirks xfs libcrc32c mlx5_core mlxfw sd_mod t10_pi sg tls ibmvscsi ibmveth scsi_transport_srp vmx_crypto pseries_wdt psample dm_mirror dm_region_hash dm_log dm_mod fuse CPU: 17 PID: 2685 Comm: drmgr Not tainted 6.7.0-203405+ #66 Hardware name: IBM,9080-HEX POWER10 (raw) 0x800200 0xf000006 of:IBM,FW1060.00 (NH1060_008) hv:phyp pSeries NIP: c0000000006bbe5c LR: c000000000a13e68 CTR: c0000000000579f8 REGS: c00000009924f240 TRAP: 0300 Not tainted (6.7.0-203405+) MSR: 8000000000009033 CR: 24002220 XER: 20040006 CFAR: c000000000a13e64 DAR: 0000000000000030 DSISR: 40000000 IRQMASK: 0 ... 
NIP sysfs_add_link_to_group+0x34/0x94 LR iommu_device_link+0x5c/0x118 Call Trace: iommu_init_device+0x26c/0x318 (unreliable) iommu_device_link+0x5c/0x118 iommu_init_device+0xa8/0x318 iommu_probe_device+0xc0/0x134 iommu_bus_notifier+0x44/0x104 notifier_call_chain+0xb8/0x19c blocking_notifier_call_chain+0x64/0x98 bus_notify+0x50/0x7c device_add+0x640/0x918 pci_device_add+0x23c/0x298 of_create_pci_dev+0x400/0x884 of_scan_pci_dev+0x124/0x1b0 __of_scan_bus+0x78/0x18c pcibios_scan_phb+0x2a4/0x3b0 init_phb_dynamic+0xb8/0x110 dlpar_add_slot+0x170/0x3b8 [rpadlpar_io] add_slot_store.part.0+0xb4/0x130 [rpadlpar_io] kobj_attr_store+0x2c/0x48 sysfs_kf_write+0x64/0x78 kernfs_fop_write_iter+0x1b0/0x290 vfs_write+0x350/0x4a0 ksys_write+0x84/0x140 system_call_exception+0x124/0x330 system_call_vectored_common+0x15c/0x2ec Commit a940904443e4 ("powerpc/iommu: Add iommu_ops to report capabilities and allow blocking domains") broke DLPAR add of PCI devices. The above added iommu_device structure to pci_controller. During system boot, PCI devices are discovered and this newly added iommu_device structure is initialized by a call to iommu_device_register(). During DLPAR add of a PCI device, a new pci_controller structure is allocated but there are no calls made to iommu_device_register() interface. Fix is to register the iommu device during DLPAR add as well. Fixes: a940904443e4 ("powerpc/iommu: Add iommu_ops to report capabilities and allow blocking domains") Signed-off-by: Gaurav Batra [mpe: Trim oops and tweak some change log wording] Signed-off-by: Michael Ellerman Link: https://msgid.link/20240122222407.39603-1-gbatra@linux.ibm.com --- arch/powerpc/include/asm/ppc-pci.h | 3 +++ arch/powerpc/kernel/iommu.c | 21 ++++++++++++++++----- arch/powerpc/platforms/pseries/pci_dlpar.c | 4 ++++ 3 files changed, 23 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/ppc-pci.h b/arch/powerpc/include/asm/ppc-pci.h index ce2b1b5eebdd..c3a3f3df36d1 100644 --- a/arch/powerpc/include/asm/ppc-pci.h +++ b/arch/powerpc/include/asm/ppc-pci.h @@ -29,6 +29,9 @@ void *pci_traverse_device_nodes(struct device_node *start, void *(*fn)(struct device_node *, void *), void *data); extern void pci_devs_phb_init_dynamic(struct pci_controller *phb); +extern void ppc_iommu_register_device(struct pci_controller *phb); +extern void ppc_iommu_unregister_device(struct pci_controller *phb); + /* From rtas_pci.h */ extern void init_pci_config_tokens (void); diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index ebe259bdd462..c6f62e130d55 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -1388,6 +1388,21 @@ static const struct attribute_group *spapr_tce_iommu_groups[] = { NULL, }; +void ppc_iommu_register_device(struct pci_controller *phb) +{ + iommu_device_sysfs_add(&phb->iommu, phb->parent, + spapr_tce_iommu_groups, "iommu-phb%04x", + phb->global_number); + iommu_device_register(&phb->iommu, &spapr_tce_iommu_ops, + phb->parent); +} + +void ppc_iommu_unregister_device(struct pci_controller *phb) +{ + iommu_device_unregister(&phb->iommu); + iommu_device_sysfs_remove(&phb->iommu); +} + /* * This registers IOMMU devices of PHBs. 
This needs to happen * after core_initcall(iommu_init) + postcore_initcall(pci_driver_init) and @@ -1398,11 +1413,7 @@ static int __init spapr_tce_setup_phb_iommus_initcall(void) struct pci_controller *hose; list_for_each_entry(hose, &hose_list, list_node) { - iommu_device_sysfs_add(&hose->iommu, hose->parent, - spapr_tce_iommu_groups, "iommu-phb%04x", - hose->global_number); - iommu_device_register(&hose->iommu, &spapr_tce_iommu_ops, - hose->parent); + ppc_iommu_register_device(hose); } return 0; } diff --git a/arch/powerpc/platforms/pseries/pci_dlpar.c b/arch/powerpc/platforms/pseries/pci_dlpar.c index 4ba824568119..4448386268d9 100644 --- a/arch/powerpc/platforms/pseries/pci_dlpar.c +++ b/arch/powerpc/platforms/pseries/pci_dlpar.c @@ -35,6 +35,8 @@ struct pci_controller *init_phb_dynamic(struct device_node *dn) pseries_msi_allocate_domains(phb); + ppc_iommu_register_device(phb); + /* Create EEH devices for the PHB */ eeh_phb_pe_create(phb); @@ -76,6 +78,8 @@ int remove_phb_dynamic(struct pci_controller *phb) } } + ppc_iommu_unregister_device(phb); + pseries_msi_free_domains(phb); /* Keep a reference so phb isn't freed yet */ -- cgit v1.2.3 From aad98efd0b121f63a2e1c221dcb4d4850128c697 Mon Sep 17 00:00:00 2001 From: Naveen N Rao Date: Fri, 2 Feb 2024 21:13:16 +0530 Subject: powerpc/64: Set task pt_regs->link to the LR value on scv entry Nysal reported that userspace backtraces are missing in offcputime bcc tool. As an example: $ sudo ./bcc/tools/offcputime.py -uU Tracing off-CPU time (us) of user threads by user stack... Hit Ctrl-C to end. ^C write - python (9107) 8 write - sudo (9105) 9 mmap - python (9107) 16 clock_nanosleep - multipathd (697) 3001604 The offcputime bcc tool attaches a bpf program to a kprobe on finish_task_switch(), which is usually hit on a syscall from userspace. With the switch to system call vectored, we started setting pt_regs->link to zero. This is because system call vectored behaves like a function call with LR pointing to the system call return address, and with no modification to SRR0/SRR1. The LR value does indicate our next instruction, so it is being saved as pt_regs->nip, and pt_regs->link is being set to zero. This is not a problem by itself, but BPF uses perf callchain infrastructure for capturing stack traces, and that stores LR as the second entry in the stack trace. perf has code to cope with the second entry being zero, and skips over it. However, generic userspace unwinders assume that a zero entry indicates end of the stack trace, resulting in a truncated userspace stack trace. Rather than fixing all userspace unwinders to ignore/skip past the second entry, store the real LR value in pt_regs->link so that there continues to be a valid, though duplicate entry in the stack trace. With this change: $ sudo ./bcc/tools/offcputime.py -uU Tracing off-CPU time (us) of user threads by user stack... Hit Ctrl-C to end. 
^C write write [unknown] [unknown] [unknown] [unknown] [unknown] PyObject_VectorcallMethod [unknown] [unknown] PyObject_CallOneArg PyFile_WriteObject PyFile_WriteString [unknown] [unknown] PyObject_Vectorcall _PyEval_EvalFrameDefault PyEval_EvalCode [unknown] [unknown] [unknown] _PyRun_SimpleFileObject _PyRun_AnyFileObject Py_RunMain [unknown] Py_BytesMain [unknown] __libc_start_main - python (1293) 7 write write [unknown] sudo_ev_loop_v1 sudo_ev_dispatch_v1 [unknown] [unknown] [unknown] [unknown] __libc_start_main - sudo (1291) 7 syscall syscall bpf_open_perf_buffer_opts [unknown] [unknown] [unknown] [unknown] _PyObject_MakeTpCall PyObject_Vectorcall _PyEval_EvalFrameDefault PyEval_EvalCode [unknown] [unknown] [unknown] _PyRun_SimpleFileObject _PyRun_AnyFileObject Py_RunMain [unknown] Py_BytesMain [unknown] __libc_start_main - python (1293) 11 clock_nanosleep clock_nanosleep nanosleep sleep [unknown] [unknown] __clone - multipathd (698) 3001661 Fixes: 7fa95f9adaee ("powerpc/64s: system call support for scv/rfscv instructions") Cc: stable@vger.kernel.org Reported-by: "Nysal Jan K.A" Signed-off-by: Naveen N Rao Signed-off-by: Michael Ellerman Link: https://msgid.link/20240202154316.395276-1-naveen@kernel.org --- arch/powerpc/kernel/interrupt_64.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/interrupt_64.S b/arch/powerpc/kernel/interrupt_64.S index bd863702d812..1ad059a9e2fe 100644 --- a/arch/powerpc/kernel/interrupt_64.S +++ b/arch/powerpc/kernel/interrupt_64.S @@ -52,7 +52,8 @@ _ASM_NOKPROBE_SYMBOL(system_call_vectored_\name) mr r10,r1 ld r1,PACAKSAVE(r13) std r10,0(r1) - std r11,_NIP(r1) + std r11,_LINK(r1) + std r11,_NIP(r1) /* Saved LR is also the next instruction */ std r12,_MSR(r1) std r0,GPR0(r1) std r10,GPR1(r1) @@ -70,7 +71,6 @@ _ASM_NOKPROBE_SYMBOL(system_call_vectored_\name) std r9,GPR13(r1) SAVE_NVGPRS(r1) std r11,_XER(r1) - std r11,_LINK(r1) std r11,_CTR(r1) li r11,\trapnr -- cgit v1.2.3 From a038a3ff8c6582404834852c043dadc73a5b68b4 Mon Sep 17 00:00:00 2001 From: Matthias Schiffer Date: Wed, 24 Jan 2024 11:38:38 +0100 Subject: powerpc/6xx: set High BAT Enable flag on G2_LE cores MMU_FTR_USE_HIGH_BATS is set for G2_LE cores and derivatives like e300cX, but the high BATs need to be enabled in HID2 to work. Add register definitions and add the needed setup to __setup_cpu_603. This fixes boot on CPUs like the MPC5200B with STRICT_KERNEL_RWX enabled on systems where the flag has not been set by the bootloader already. 
Fixes: e4d6654ebe6e ("powerpc/mm/32s: rework mmu_mapin_ram()")
Signed-off-by: Matthias Schiffer
Reviewed-by: Christophe Leroy
Signed-off-by: Michael Ellerman
Link: https://msgid.link/20240124103838.43675-1-matthias.schiffer@ew.tq-group.com
---
 arch/powerpc/include/asm/reg.h      |  2 ++
 arch/powerpc/kernel/cpu_setup_6xx.S | 20 +++++++++++++++++++-
 2 files changed, 21 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index 7fd09f25452d..bb47af9054a9 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -617,6 +617,8 @@
 #endif
 #define SPRN_HID2	0x3F8		/* Hardware Implementation Register 2 */
 #define SPRN_HID2_GEKKO	0x398		/* Gekko HID2 Register */
+#define SPRN_HID2_G2_LE	0x3F3		/* G2_LE HID2 Register */
+#define HID2_G2_LE_HBE	(1<<18)		/* High BAT Enable (G2_LE) */
 #define SPRN_IABR	0x3F2		/* Instruction Address Breakpoint Register */
 #define SPRN_IABR2	0x3FA		/* 83xx */
 #define SPRN_IBCR	0x135		/* 83xx Insn Breakpoint Control Reg */
diff --git a/arch/powerpc/kernel/cpu_setup_6xx.S b/arch/powerpc/kernel/cpu_setup_6xx.S
index f29ce3dd6140..bfd3f442e5eb 100644
--- a/arch/powerpc/kernel/cpu_setup_6xx.S
+++ b/arch/powerpc/kernel/cpu_setup_6xx.S
@@ -26,6 +26,15 @@ BEGIN_FTR_SECTION
 	bl	__init_fpu_registers
END_FTR_SECTION_IFCLR(CPU_FTR_FPU_UNAVAILABLE)
 	bl	setup_common_caches
+
+	/*
+	 * This assumes that all cores using __setup_cpu_603 with
+	 * MMU_FTR_USE_HIGH_BATS are G2_LE compatible
+	 */
+BEGIN_MMU_FTR_SECTION
+	bl	setup_g2_le_hid2
+END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_HIGH_BATS)
+
 	mtlr	r5
 	blr
 _GLOBAL(__setup_cpu_604)
@@ -115,6 +124,16 @@ SYM_FUNC_START_LOCAL(setup_604_hid0)
 	blr
 SYM_FUNC_END(setup_604_hid0)

+/* Enable high BATs for G2_LE and derivatives like e300cX */
+SYM_FUNC_START_LOCAL(setup_g2_le_hid2)
+	mfspr	r11,SPRN_HID2_G2_LE
+	oris	r11,r11,HID2_G2_LE_HBE@h
+	mtspr	SPRN_HID2_G2_LE,r11
+	sync
+	isync
+	blr
+SYM_FUNC_END(setup_g2_le_hid2)
+
 /* 7400 <= rev 2.7 and 7410 rev = 1.0 suffer from some
  * erratas we work around here.
  * Moto MPC710CE.pdf describes them, those are errata
@@ -495,4 +514,3 @@ _GLOBAL(__restore_cpu_setup)
 	mtcr	r7
 	blr
 _ASM_NOKPROBE_SYMBOL(__restore_cpu_setup)
-
-- cgit v1.2.3


From 4a7aee96200ad281a5cc4cf5c7a2e2a49d2b97b0 Mon Sep 17 00:00:00 2001
From: Jiangfeng Xiao
Date: Tue, 23 Jan 2024 09:45:59 +0800
Subject: powerpc/kasan: Fix addr error caused by page alignment

In kasan_init_region(), when k_start is not page aligned, then at the
beginning of the for loop k_cur = k_start & PAGE_MASK is less than k_start,
and `va = block + k_cur - k_start` therefore points below block. That address
is invalid: the memory from va up to block was not allocated by
memblock_alloc(), will not be reserved by memblock_reserve() later, and can
be handed out to other users. As a result, memory overwriting occurs.

For example:

  int __init __weak kasan_init_region(void *start, size_t size)
  {
	[...]
	/* if say block(dcd97000) k_start(feef7400) k_end(feeff3fe) */
	block = memblock_alloc(k_end - k_start, PAGE_SIZE);
	[...]
	for (k_cur = k_start & PAGE_MASK; k_cur < k_end; k_cur += PAGE_SIZE) {
		/* at the beginning of the for loop:
		 * block(dcd97000) va(dcd96c00) k_cur(feef7000) k_start(feef7400)
		 * va(dcd96c00) is less than block(dcd97000), va is invalid
		 */
		void *va = block + k_cur - k_start;
		[...]
	}
	[...]
  }

Therefore, page alignment is performed on k_start before memblock_alloc() to
ensure the validity of the va address.
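The arithmetic can be replayed in userspace with the commit's own numbers (a
minimal sketch, 4K PAGE_MASK hardcoded):

	#include <stdio.h>
	#include <stdint.h>

	#define PAGE_MASK (~0xfffUL)

	int main(void)
	{
		uintptr_t block   = 0xdcd97000;  /* values from the example above */
		uintptr_t k_start = 0xfeef7400;
		uintptr_t k_cur   = k_start & PAGE_MASK;      /* 0xfeef7000 */
		uintptr_t va      = block + k_cur - k_start;  /* 0xdcd96c00: below block */

		printf("va = %#lx, block = %#lx -> %s\n", (unsigned long)va,
		       (unsigned long)block, va < block ? "out of bounds" : "ok");

		k_start &= PAGE_MASK;          /* the fix: align k_start first */
		va = block + k_cur - k_start;  /* now va == block */
		printf("after alignment: va = %#lx\n", (unsigned long)va);
		return 0;
	}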
Fixes: 663c0c9496a6 ("powerpc/kasan: Fix shadow area set up for modules.")
Signed-off-by: Jiangfeng Xiao
Signed-off-by: Michael Ellerman
Link: https://msgid.link/1705974359-43790-1-git-send-email-xiaojiangfeng@huawei.com
---
 arch/powerpc/mm/kasan/init_32.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch')

diff --git a/arch/powerpc/mm/kasan/init_32.c b/arch/powerpc/mm/kasan/init_32.c
index a70828a6d935..aa9aa11927b2 100644
--- a/arch/powerpc/mm/kasan/init_32.c
+++ b/arch/powerpc/mm/kasan/init_32.c
@@ -64,6 +64,7 @@ int __init __weak kasan_init_region(void *start, size_t size)
 	if (ret)
 		return ret;

+	k_start = k_start & PAGE_MASK;
 	block = memblock_alloc(k_end - k_start, PAGE_SIZE);
 	if (!block)
 		return -ENOMEM;
-- cgit v1.2.3


From 3376ca3f1a2075eaa23c5576c47d04d7e8a4adda Mon Sep 17 00:00:00 2001
From: Mathias Krause
Date: Sat, 3 Feb 2024 13:45:20 +0100
Subject: KVM: x86: Fix KVM_GET_MSRS stack info leak

Commit 6abe9c1386e5 ("KVM: X86: Move ignore_msrs handling upper the stack")
changed the 'ignore_msrs' handling, including sanitizing return values to the
caller. This was fine until commit 12bc2132b15e ("KVM: X86: Do the same
ignore_msrs check for feature msrs") which allowed non-existing feature MSRs
to be ignored, i.e. to not generate an error on the ioctl() level. It even
tried to preserve the sanitization of the return value. However, the logic is
flawed, as '*data' will be overwritten again with the uninitialized stack
value of msr.data.

Fix this by simplifying the logic and always initializing msr.data, removing
the need for an additional error exit path.

Fixes: 12bc2132b15e ("KVM: X86: Do the same ignore_msrs check for feature msrs")
Signed-off-by: Mathias Krause
Reviewed-by: Xiaoyao Li
Link: https://lore.kernel.org/r/20240203124522.592778-2-minipli@grsecurity.net
Signed-off-by: Sean Christopherson
---
 arch/x86/kvm/x86.c | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 25bc52cdf8f4..3f6e82fd37f3 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1704,22 +1704,17 @@ static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
 	struct kvm_msr_entry msr;
 	int r;

+	/* Unconditionally clear the output for simplicity */
+	msr.data = 0;
 	msr.index = index;
 	r = kvm_get_msr_feature(&msr);

-	if (r == KVM_MSR_RET_INVALID) {
-		/* Unconditionally clear the output for simplicity */
-		*data = 0;
-		if (kvm_msr_ignored_check(index, 0, false))
-			r = 0;
-	}
-
-	if (r)
-		return r;
+	if (r == KVM_MSR_RET_INVALID && kvm_msr_ignored_check(index, 0, false))
+		r = 0;

 	*data = msr.data;

-	return 0;
+	return r;
 }

 static bool __kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
-- cgit v1.2.3


From 5c84bc8b617bf90e722cc57d447abd9a468d3a52 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann
Date: Tue, 23 Jan 2024 13:51:41 +0100
Subject: powerpc: udbg_memcons: mark functions static

ppc64_book3e_allmodconfig has one more driver that triggers a few
missing-prototypes warnings:

arch/powerpc/sysdev/udbg_memcons.c:44:6: error: no previous prototype for 'memcons_putc' [-Werror=missing-prototypes]
arch/powerpc/sysdev/udbg_memcons.c:57:5: error: no previous prototype for 'memcons_getc_poll' [-Werror=missing-prototypes]
arch/powerpc/sysdev/udbg_memcons.c:80:5: error: no previous prototype for 'memcons_getc' [-Werror=missing-prototypes]

Mark all these functions static as there are no other users.
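For reference, the warning class being silenced here can be reproduced with a
two-line file built with -Wmissing-prototypes (hypothetical names):

	/* cc -c -Wmissing-prototypes demo.c */
	int used_in_this_file_only(void) { return 0; }  /* warning: no previous prototype */
	static int also_local(void) { return 0; }       /* fine: internal linkage */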
Signed-off-by: Arnd Bergmann Signed-off-by: Michael Ellerman Link: https://msgid.link/20240123125148.2004648-1-arnd@kernel.org --- arch/powerpc/sysdev/udbg_memcons.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/sysdev/udbg_memcons.c b/arch/powerpc/sysdev/udbg_memcons.c index 5020044400dc..4de57ba52236 100644 --- a/arch/powerpc/sysdev/udbg_memcons.c +++ b/arch/powerpc/sysdev/udbg_memcons.c @@ -41,7 +41,7 @@ struct memcons memcons = { .input_end = &memcons_input[CONFIG_PPC_MEMCONS_INPUT_SIZE], }; -void memcons_putc(char c) +static void memcons_putc(char c) { char *new_output_pos; @@ -54,7 +54,7 @@ void memcons_putc(char c) memcons.output_pos = new_output_pos; } -int memcons_getc_poll(void) +static int memcons_getc_poll(void) { char c; char *new_input_pos; @@ -77,7 +77,7 @@ int memcons_getc_poll(void) return -1; } -int memcons_getc(void) +static int memcons_getc(void) { int c; -- cgit v1.2.3 From 1c57b9f63ab34f01b8c73731cc0efacb5a9a2f16 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 23 Jan 2024 13:51:42 +0100 Subject: powerpc: 85xx: mark local functions static These functions are either used in only one file and can just be made static or need an #include statement to avoid a warning: arch/powerpc/platforms/85xx/mpc8536_ds.c:30:13: error: no previous prototype for 'mpc8536_ds_pic_init' [-Werror=missing-prototypes] arch/powerpc/platforms/85xx/p1010rdb.c:27:13: error: no previous prototype for 'p1010_rdb_pic_init' [-Werror=missing-prototypes] arch/powerpc/platforms/85xx/p1022_ds.c:373:6: error: no previous prototype for 'p1022ds_set_pixel_clock' [-Werror=missing-prototypes] arch/powerpc/platforms/85xx/p1022_ds.c:422:1: error: no previous prototype for 'p1022ds_valid_monitor_port' [-Werror=missing-prototypes] arch/powerpc/platforms/85xx/p1022_ds.c:435:13: error: no previous prototype for 'p1022_ds_pic_init' [-Werror=missing-prototypes] arch/powerpc/platforms/85xx/p1022_rdk.c:43:6: error: no previous prototype for 'p1022rdk_set_pixel_clock' [-Werror=missing-prototypes] arch/powerpc/platforms/85xx/p1022_rdk.c:92:1: error: no previous prototype for 'p1022rdk_valid_monitor_port' [-Werror=missing-prototypes] arch/powerpc/platforms/85xx/p1022_rdk.c:99:13: error: no previous prototype for 'p1022_rdk_pic_init' [-Werror=missing-prototypes] arch/powerpc/platforms/85xx/socrates_fpga_pic.c:273:13: error: no previous prototype for 'socrates_fpga_pic_init' [-Werror=missing-prototypes] arch/powerpc/platforms/85xx/xes_mpc85xx.c:40:13: error: no previous prototype for 'xes_mpc85xx_pic_init' [-Werror=missing-prototypes] arch/powerpc/platforms/85xx/mvme2500.c:24:13: error: no previous prototype for 'mvme2500_pic_init' [-Werror=missing-prototypes] Signed-off-by: Arnd Bergmann Signed-off-by: Michael Ellerman Link: https://msgid.link/20240123125148.2004648-2-arnd@kernel.org --- arch/powerpc/platforms/85xx/mpc8536_ds.c | 2 +- arch/powerpc/platforms/85xx/mvme2500.c | 2 +- arch/powerpc/platforms/85xx/p1010rdb.c | 2 +- arch/powerpc/platforms/85xx/p1022_ds.c | 6 +++--- arch/powerpc/platforms/85xx/p1022_rdk.c | 6 +++--- arch/powerpc/platforms/85xx/socrates_fpga_pic.c | 2 ++ arch/powerpc/platforms/85xx/xes_mpc85xx.c | 2 +- 7 files changed, 12 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/platforms/85xx/mpc8536_ds.c b/arch/powerpc/platforms/85xx/mpc8536_ds.c index e966b2ad8ecd..b3327a358eb4 100644 --- a/arch/powerpc/platforms/85xx/mpc8536_ds.c +++ b/arch/powerpc/platforms/85xx/mpc8536_ds.c @@ -27,7 +27,7 @@ #include 
"mpc85xx.h" -void __init mpc8536_ds_pic_init(void) +static void __init mpc8536_ds_pic_init(void) { struct mpic *mpic = mpic_alloc(NULL, 0, MPIC_BIG_ENDIAN, 0, 256, " OpenPIC "); diff --git a/arch/powerpc/platforms/85xx/mvme2500.c b/arch/powerpc/platforms/85xx/mvme2500.c index 1b59e45a0c64..19122daadb55 100644 --- a/arch/powerpc/platforms/85xx/mvme2500.c +++ b/arch/powerpc/platforms/85xx/mvme2500.c @@ -21,7 +21,7 @@ #include "mpc85xx.h" -void __init mvme2500_pic_init(void) +static void __init mvme2500_pic_init(void) { struct mpic *mpic = mpic_alloc(NULL, 0, MPIC_BIG_ENDIAN | MPIC_SINGLE_DEST_CPU, diff --git a/arch/powerpc/platforms/85xx/p1010rdb.c b/arch/powerpc/platforms/85xx/p1010rdb.c index 10d6f1fa3327..491895ac8bcf 100644 --- a/arch/powerpc/platforms/85xx/p1010rdb.c +++ b/arch/powerpc/platforms/85xx/p1010rdb.c @@ -24,7 +24,7 @@ #include "mpc85xx.h" -void __init p1010_rdb_pic_init(void) +static void __init p1010_rdb_pic_init(void) { struct mpic *mpic = mpic_alloc(NULL, 0, MPIC_BIG_ENDIAN | MPIC_SINGLE_DEST_CPU, diff --git a/arch/powerpc/platforms/85xx/p1022_ds.c b/arch/powerpc/platforms/85xx/p1022_ds.c index 0dd786a061a6..adc3a2ee1415 100644 --- a/arch/powerpc/platforms/85xx/p1022_ds.c +++ b/arch/powerpc/platforms/85xx/p1022_ds.c @@ -370,7 +370,7 @@ exit: * * @pixclock: the wavelength, in picoseconds, of the clock */ -void p1022ds_set_pixel_clock(unsigned int pixclock) +static void p1022ds_set_pixel_clock(unsigned int pixclock) { struct device_node *guts_np = NULL; struct ccsr_guts __iomem *guts; @@ -418,7 +418,7 @@ void p1022ds_set_pixel_clock(unsigned int pixclock) /** * p1022ds_valid_monitor_port: set the monitor port for sysfs */ -enum fsl_diu_monitor_port +static enum fsl_diu_monitor_port p1022ds_valid_monitor_port(enum fsl_diu_monitor_port port) { switch (port) { @@ -432,7 +432,7 @@ p1022ds_valid_monitor_port(enum fsl_diu_monitor_port port) #endif -void __init p1022_ds_pic_init(void) +static void __init p1022_ds_pic_init(void) { struct mpic *mpic = mpic_alloc(NULL, 0, MPIC_BIG_ENDIAN | MPIC_SINGLE_DEST_CPU, diff --git a/arch/powerpc/platforms/85xx/p1022_rdk.c b/arch/powerpc/platforms/85xx/p1022_rdk.c index 25ab6e9c1470..6198299d95b1 100644 --- a/arch/powerpc/platforms/85xx/p1022_rdk.c +++ b/arch/powerpc/platforms/85xx/p1022_rdk.c @@ -40,7 +40,7 @@ * * @pixclock: the wavelength, in picoseconds, of the clock */ -void p1022rdk_set_pixel_clock(unsigned int pixclock) +static void p1022rdk_set_pixel_clock(unsigned int pixclock) { struct device_node *guts_np = NULL; struct ccsr_guts __iomem *guts; @@ -88,7 +88,7 @@ void p1022rdk_set_pixel_clock(unsigned int pixclock) /** * p1022rdk_valid_monitor_port: set the monitor port for sysfs */ -enum fsl_diu_monitor_port +static enum fsl_diu_monitor_port p1022rdk_valid_monitor_port(enum fsl_diu_monitor_port port) { return FSL_DIU_PORT_DVI; @@ -96,7 +96,7 @@ p1022rdk_valid_monitor_port(enum fsl_diu_monitor_port port) #endif -void __init p1022_rdk_pic_init(void) +static void __init p1022_rdk_pic_init(void) { struct mpic *mpic = mpic_alloc(NULL, 0, MPIC_BIG_ENDIAN | MPIC_SINGLE_DEST_CPU, diff --git a/arch/powerpc/platforms/85xx/socrates_fpga_pic.c b/arch/powerpc/platforms/85xx/socrates_fpga_pic.c index baa12eff6d5d..60e0b8947ce6 100644 --- a/arch/powerpc/platforms/85xx/socrates_fpga_pic.c +++ b/arch/powerpc/platforms/85xx/socrates_fpga_pic.c @@ -8,6 +8,8 @@ #include #include +#include "socrates_fpga_pic.h" + /* * The FPGA supports 9 interrupt sources, which can be routed to 3 * interrupt request lines of the MPIC. 
The line to be used can be diff --git a/arch/powerpc/platforms/85xx/xes_mpc85xx.c b/arch/powerpc/platforms/85xx/xes_mpc85xx.c index 45f257fc1ade..2582427d8d01 100644 --- a/arch/powerpc/platforms/85xx/xes_mpc85xx.c +++ b/arch/powerpc/platforms/85xx/xes_mpc85xx.c @@ -37,7 +37,7 @@ #define MPC85xx_L2CTL_L2I 0x40000000 /* L2 flash invalidate */ #define MPC85xx_L2CTL_L2SIZ_MASK 0x30000000 /* L2 SRAM size (R/O) */ -void __init xes_mpc85xx_pic_init(void) +static void __init xes_mpc85xx_pic_init(void) { struct mpic *mpic = mpic_alloc(NULL, 0, MPIC_BIG_ENDIAN, 0, 256, " OpenPIC "); -- cgit v1.2.3 From 17c8e9ac95d81d10e9619a845a68b83c9384be64 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Wed, 31 Jan 2024 13:05:13 +0100 Subject: RISC-V: paravirt: steal_time should be static steal_time is not used outside paravirt.c, make it static, as sparse suggested. Fixes: fdf68acccfc6 ("RISC-V: paravirt: Implement steal-time support") Signed-off-by: Andrew Jones Reviewed-by: Atish Patra Signed-off-by: Anup Patel --- arch/riscv/kernel/paravirt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/riscv/kernel/paravirt.c b/arch/riscv/kernel/paravirt.c index 8e114f5930ce..15c5d4048db5 100644 --- a/arch/riscv/kernel/paravirt.c +++ b/arch/riscv/kernel/paravirt.c @@ -41,7 +41,7 @@ static int __init parse_no_stealacc(char *arg) early_param("no-steal-acc", parse_no_stealacc); -DEFINE_PER_CPU(struct sbi_sta_struct, steal_time) __aligned(64); +static DEFINE_PER_CPU(struct sbi_sta_struct, steal_time) __aligned(64); static bool __init has_pv_steal_clock(void) { -- cgit v1.2.3 From 3752219b6007ee0421cc228f442f33b31f85addd Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Wed, 31 Jan 2024 13:05:14 +0100 Subject: RISC-V: paravirt: Use correct restricted types __le32 and __le64 types should be used with le32_to_cpu() and le64_to_cpu(), as sparse helpfully points out. Fixes: fdf68acccfc6 ("RISC-V: paravirt: Implement steal-time support") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202401011933.hL9zqmKo-lkp@intel.com/ Signed-off-by: Andrew Jones Reviewed-by: Atish Patra Signed-off-by: Anup Patel --- arch/riscv/kernel/paravirt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/riscv/kernel/paravirt.c b/arch/riscv/kernel/paravirt.c index 15c5d4048db5..0d6225fd3194 100644 --- a/arch/riscv/kernel/paravirt.c +++ b/arch/riscv/kernel/paravirt.c @@ -91,8 +91,8 @@ static int pv_time_cpu_down_prepare(unsigned int cpu) static u64 pv_time_steal_clock(int cpu) { struct sbi_sta_struct *st = per_cpu_ptr(&steal_time, cpu); - u32 sequence; - u64 steal; + __le32 sequence; + __le64 steal; /* * Check the sequence field before and after reading the steal -- cgit v1.2.3 From f072b272aa27d57cf7fe6fdedb30fb50f391974e Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Wed, 31 Jan 2024 13:05:15 +0100 Subject: RISC-V: KVM: Use correct restricted types __le32 and __le64 types should be used with le32_to_cpu() and le64_to_cpu() and __user is needed for pointers referencing guest memory, as sparse helpfully points out. 
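The sparse-clean pattern the next patch applies is to keep the raw value in a
__le* variable and convert exactly once; a fragment-level sketch (kernel
types assumed, mirroring the diff below):

	__le32 __user *sequence_ptr;   /* guest memory: annotated __user */
	__le32 sequence_le;            /* raw little-endian value */
	u32 sequence;                  /* native-endian working copy */

	if (get_user(sequence_le, sequence_ptr))
		return;
	sequence = le32_to_cpu(sequence_le);  /* single conversion point */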
Fixes: e9f12b5fff8a ("RISC-V: KVM: Implement SBI STA extension") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202401020142.lwFEDK5v-lkp@intel.com/ Signed-off-by: Andrew Jones Reviewed-by: Atish Patra Signed-off-by: Anup Patel --- arch/riscv/kvm/vcpu_sbi_sta.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/riscv/kvm/vcpu_sbi_sta.c b/arch/riscv/kvm/vcpu_sbi_sta.c index 01f09fe8c3b0..d8cf9ca28c61 100644 --- a/arch/riscv/kvm/vcpu_sbi_sta.c +++ b/arch/riscv/kvm/vcpu_sbi_sta.c @@ -26,8 +26,12 @@ void kvm_riscv_vcpu_record_steal_time(struct kvm_vcpu *vcpu) { gpa_t shmem = vcpu->arch.sta.shmem; u64 last_steal = vcpu->arch.sta.last_steal; - u32 *sequence_ptr, sequence; - u64 *steal_ptr, steal; + __le32 __user *sequence_ptr; + __le64 __user *steal_ptr; + __le32 sequence_le; + __le64 steal_le; + u32 sequence; + u64 steal; unsigned long hva; gfn_t gfn; @@ -47,22 +51,22 @@ void kvm_riscv_vcpu_record_steal_time(struct kvm_vcpu *vcpu) return; } - sequence_ptr = (u32 *)(hva + offset_in_page(shmem) + + sequence_ptr = (__le32 __user *)(hva + offset_in_page(shmem) + offsetof(struct sbi_sta_struct, sequence)); - steal_ptr = (u64 *)(hva + offset_in_page(shmem) + + steal_ptr = (__le64 __user *)(hva + offset_in_page(shmem) + offsetof(struct sbi_sta_struct, steal)); - if (WARN_ON(get_user(sequence, sequence_ptr))) + if (WARN_ON(get_user(sequence_le, sequence_ptr))) return; - sequence = le32_to_cpu(sequence); + sequence = le32_to_cpu(sequence_le); sequence += 1; if (WARN_ON(put_user(cpu_to_le32(sequence), sequence_ptr))) return; - if (!WARN_ON(get_user(steal, steal_ptr))) { - steal = le64_to_cpu(steal); + if (!WARN_ON(get_user(steal_le, steal_ptr))) { + steal = le64_to_cpu(steal_le); vcpu->arch.sta.last_steal = READ_ONCE(current->sched_info.run_delay); steal += vcpu->arch.sta.last_steal - last_steal; WARN_ON(put_user(cpu_to_le64(steal), steal_ptr)); -- cgit v1.2.3 From 61da7c8e2a602f66be578cbbcebe8638c10e0f48 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 30 Jan 2024 15:43:53 +0000 Subject: arm64/signal: Don't assume that TIF_SVE means we saved SVE state When we are in a syscall we will only save the FPSIMD subset even though the task still has access to the full register set, and on context switch we will only remove TIF_SVE when loading the register state. This means that the signal handling code should not assume that TIF_SVE means that the register state is stored in SVE format, it should instead check the format that was recorded during save. 
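In other words, the save path should key on the recorded format rather than
the flag; a fragment-level sketch of the check the diff below switches to:

	/* convert only when the saved state really is in SVE layout */
	if (current->thread.fp_type == FP_STATE_SVE)
		sve_to_fpsimd(current);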
Fixes: 8c845e273104 ("arm64/sve: Leave SVE enabled on syscall if we don't context switch") Signed-off-by: Mark Brown Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240130-arm64-sve-signal-regs-v2-1-9fc6f9502782@kernel.org Signed-off-by: Will Deacon --- arch/arm64/kernel/fpsimd.c | 2 +- arch/arm64/kernel/signal.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index a5dc6f764195..25ceaee6b025 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -1635,7 +1635,7 @@ void fpsimd_preserve_current_state(void) void fpsimd_signal_preserve_current_state(void) { fpsimd_preserve_current_state(); - if (test_thread_flag(TIF_SVE)) + if (current->thread.fp_type == FP_STATE_SVE) sve_to_fpsimd(current); } diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index 0e8beb3349ea..425b1bc17a3f 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -242,7 +242,7 @@ static int preserve_sve_context(struct sve_context __user *ctx) vl = task_get_sme_vl(current); vq = sve_vq_from_vl(vl); flags |= SVE_SIG_FLAG_SM; - } else if (test_thread_flag(TIF_SVE)) { + } else if (current->thread.fp_type == FP_STATE_SVE) { vq = sve_vq_from_vl(vl); } @@ -878,7 +878,7 @@ static int setup_sigframe_layout(struct rt_sigframe_user_layout *user, if (system_supports_sve() || system_supports_sme()) { unsigned int vq = 0; - if (add_all || test_thread_flag(TIF_SVE) || + if (add_all || current->thread.fp_type == FP_STATE_SVE || thread_sm_enabled(¤t->thread)) { int vl = max(sve_max_vl(), sme_max_vl()); -- cgit v1.2.3 From c0b26c06ed008c0898829dda4c2783b8ec478350 Mon Sep 17 00:00:00 2001 From: Seongsu Park Date: Fri, 2 Feb 2024 10:33:06 +0900 Subject: arm64: fix typo in comments fix typo in comments thath -> that Signed-off-by: Seongsu Park Link: https://lore.kernel.org/r/20240202013306.883777-1-sgsu.park@samsung.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/cpufeature.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 21c824edf8ce..bd8d4ca81a48 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -83,7 +83,7 @@ struct arm64_ftr_bits { * to full-0 denotes that this field has no override * * A @mask field set to full-0 with the corresponding @val field set - * to full-1 denotes thath this field has an invalid override. + * to full-1 denotes that this field has an invalid override. */ struct arm64_ftr_override { u64 val; -- cgit v1.2.3 From f9daab0ad01cf9d165dbbbf106ca4e61d06e7fe8 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Mon, 5 Feb 2024 23:45:52 -0800 Subject: arm64: jump_label: use constraints "Si" instead of "i" The generic constraint "i" seems to be copied from x86 or arm (and with a redundant generic operand modifier "c"). It works with -fno-PIE but not with -fPIE/-fPIC in GCC's aarch64 port. The machine constraint "S", which denotes a symbol or label reference with a constant offset, supports PIC and has been available in GCC since 2012 and in Clang since 7.0. However, Clang before 19 does not support "S" on a symbol with a constant offset [1] (e.g. `static_key_false(&nf_hooks_needed[pf][hook])` in include/linux/netfilter.h), so we use "i" as a fallback. 
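Concretely, the change is confined to the asm goto operand list (excerpt from
the diff below):

	/* before: address arithmetic forces a plain absolute immediate */
	: : "i"(&((char *)key)[branch]) : : l_yes
	/* after: the symbol goes through "S" (PIC-friendly, "i" as fallback)
	 * and the branch bit becomes a separate immediate */
	: : [key]"Si"(key), [bit0]"i"(branch) : : l_yes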
Suggested-by: Ard Biesheuvel Signed-off-by: Fangrui Song Link: https://github.com/llvm/llvm-project/pull/80255 [1] Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20240206074552.541154-1-maskray@google.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/jump_label.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/jump_label.h b/arch/arm64/include/asm/jump_label.h index 48ddc0f45d22..b7716b215f91 100644 --- a/arch/arm64/include/asm/jump_label.h +++ b/arch/arm64/include/asm/jump_label.h @@ -15,6 +15,10 @@ #define JUMP_LABEL_NOP_SIZE AARCH64_INSN_SIZE +/* + * Prefer the constraint "S" to support PIC with GCC. Clang before 19 does not + * support "S" on a symbol with a constant offset, so we use "i" as a fallback. + */ static __always_inline bool arch_static_branch(struct static_key * const key, const bool branch) { @@ -23,9 +27,9 @@ static __always_inline bool arch_static_branch(struct static_key * const key, " .pushsection __jump_table, \"aw\" \n\t" " .align 3 \n\t" " .long 1b - ., %l[l_yes] - . \n\t" - " .quad %c0 - . \n\t" + " .quad (%[key] - .) + %[bit0] \n\t" " .popsection \n\t" - : : "i"(&((char *)key)[branch]) : : l_yes); + : : [key]"Si"(key), [bit0]"i"(branch) : : l_yes); return false; l_yes: @@ -40,9 +44,9 @@ static __always_inline bool arch_static_branch_jump(struct static_key * const ke " .pushsection __jump_table, \"aw\" \n\t" " .align 3 \n\t" " .long 1b - ., %l[l_yes] - . \n\t" - " .quad %c0 - . \n\t" + " .quad (%[key] - .) + %[bit0] \n\t" " .popsection \n\t" - : : "i"(&((char *)key)[branch]) : : l_yes); + : : [key]"Si"(key), [bit0]"i"(branch) : : l_yes); return false; l_yes: -- cgit v1.2.3 From d794734c9bbfe22f86686dc2909c25f5ffe1a572 Mon Sep 17 00:00:00 2001 From: Steve Wahl Date: Fri, 26 Jan 2024 10:48:41 -0600 Subject: x86/mm/ident_map: Use gbpages only where full GB page should be mapped. When ident_pud_init() uses only gbpages to create identity maps, large ranges of addresses not actually requested can be included in the resulting table; a 4K request will map a full GB. On UV systems, this ends up including regions that will cause hardware to halt the system if accessed (these are marked "reserved" by BIOS). Even processor speculation into these regions is enough to trigger the system halt. Only use gbpages when map creation requests include the full GB page of space. Fall back to using smaller 2M pages when only portions of a GB page are included in the request. No attempt is made to coalesce mapping requests. If a request requires a map entry at the 2M (pmd) level, subsequent mapping requests within the same 1G region will also be at the pmd level, even if adjacent or overlapping such requests could have been combined to map a full gbpage. Existing usage starts with larger regions and then adds smaller regions, so this should not have any great consequence. 
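The new condition can be exercised standalone; a minimal userspace sketch of
the alignment checks (1G PUD size assumed, pud_present() handling omitted):

	#include <stdio.h>
	#include <stdint.h>

	#define PUD_SIZE (1ULL << 30)
	#define PUD_MASK (~(PUD_SIZE - 1))

	/* A gbpage is used only when [addr, next) covers the whole 1G region. */
	static int use_gbpage(uint64_t addr, uint64_t next)
	{
		return ((addr & ~PUD_MASK) == 0) && ((next & ~PUD_MASK) == 0);
	}

	int main(void)
	{
		printf("%d\n", use_gbpage(0x40000000ULL, 0x80000000ULL)); /* full GB: 1 */
		printf("%d\n", use_gbpage(0x40000000ULL, 0x40001000ULL)); /* 4K piece: 0 */
		return 0;
	}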
[ dhansen: fix up comment formatting, simplifty changelog ] Signed-off-by: Steve Wahl Signed-off-by: Dave Hansen Cc: stable@vger.kernel.org Link: https://lore.kernel.org/all/20240126164841.170866-1-steve.wahl%40hpe.com --- arch/x86/mm/ident_map.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c index 968d7005f4a7..f50cc210a981 100644 --- a/arch/x86/mm/ident_map.c +++ b/arch/x86/mm/ident_map.c @@ -26,18 +26,31 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page, for (; addr < end; addr = next) { pud_t *pud = pud_page + pud_index(addr); pmd_t *pmd; + bool use_gbpage; next = (addr & PUD_MASK) + PUD_SIZE; if (next > end) next = end; - if (info->direct_gbpages) { - pud_t pudval; + /* if this is already a gbpage, this portion is already mapped */ + if (pud_large(*pud)) + continue; + + /* Is using a gbpage allowed? */ + use_gbpage = info->direct_gbpages; - if (pud_present(*pud)) - continue; + /* Don't use gbpage if it maps more than the requested region. */ + /* at the begining: */ + use_gbpage &= ((addr & ~PUD_MASK) == 0); + /* ... or at the end: */ + use_gbpage &= ((next & ~PUD_MASK) == 0); + + /* Never overwrite existing mappings */ + use_gbpage &= !pud_present(*pud); + + if (use_gbpage) { + pud_t pudval; - addr &= PUD_MASK; pudval = __pud((addr - info->offset) | info->page_flag); set_pud(pud, pudval); continue; -- cgit v1.2.3 From c60d847be7b8e69e419e02a2b3d19c2842a3c35d Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 12 Feb 2024 19:30:52 +0000 Subject: KVM: arm64: Fix double-free following kvm_pgtable_stage2_free_unlinked() kvm_pgtable_stage2_free_unlinked() does the final put_page() on the root page of the sub-tree before returning, so remove the additional put_page() invocations in the callers. Cc: Ricardo Koller Fixes: f6a27d6dc51b2 ("KVM: arm64: Drop last page ref in kvm_pgtable_stage2_free_removed()") Signed-off-by: Will Deacon Reviewed-by: Oliver Upton Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20240212193052.27765-1-will@kernel.org --- arch/arm64/kvm/hyp/pgtable.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index c651df904fe3..ab9d05fcf98b 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -1419,7 +1419,6 @@ kvm_pte_t *kvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt, level + 1); if (ret) { kvm_pgtable_stage2_free_unlinked(mm_ops, pgtable, level); - mm_ops->put_page(pgtable); return ERR_PTR(ret); } @@ -1502,7 +1501,6 @@ static int stage2_split_walker(const struct kvm_pgtable_visit_ctx *ctx, if (!stage2_try_break_pte(ctx, mmu)) { kvm_pgtable_stage2_free_unlinked(mm_ops, childp, level); - mm_ops->put_page(childp); return -EAGAIN; } -- cgit v1.2.3 From 1fba2bf8e9d5a27b7394856181b6200de7260b79 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 14 Feb 2024 11:00:41 +1100 Subject: Revert "powerpc/pseries/iommu: Fix iommu initialisation during DLPAR add" This reverts commit ed8b94f6e0acd652ce69bd69d678a0c769172df8. Gaurav reported that there are still problems with the patch and it should be reverted pending a fuller fix. 
Link: https://lore.kernel.org/all/4f6fc1ac-7a76-4447-9d0e-f55c0be373f8@linux.ibm.com/ Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/ppc-pci.h | 3 --- arch/powerpc/kernel/iommu.c | 21 +++++---------------- arch/powerpc/platforms/pseries/pci_dlpar.c | 4 ---- 3 files changed, 5 insertions(+), 23 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/ppc-pci.h b/arch/powerpc/include/asm/ppc-pci.h index c3a3f3df36d1..ce2b1b5eebdd 100644 --- a/arch/powerpc/include/asm/ppc-pci.h +++ b/arch/powerpc/include/asm/ppc-pci.h @@ -29,9 +29,6 @@ void *pci_traverse_device_nodes(struct device_node *start, void *(*fn)(struct device_node *, void *), void *data); extern void pci_devs_phb_init_dynamic(struct pci_controller *phb); -extern void ppc_iommu_register_device(struct pci_controller *phb); -extern void ppc_iommu_unregister_device(struct pci_controller *phb); - /* From rtas_pci.h */ extern void init_pci_config_tokens (void); diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index c6f62e130d55..ebe259bdd462 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -1388,21 +1388,6 @@ static const struct attribute_group *spapr_tce_iommu_groups[] = { NULL, }; -void ppc_iommu_register_device(struct pci_controller *phb) -{ - iommu_device_sysfs_add(&phb->iommu, phb->parent, - spapr_tce_iommu_groups, "iommu-phb%04x", - phb->global_number); - iommu_device_register(&phb->iommu, &spapr_tce_iommu_ops, - phb->parent); -} - -void ppc_iommu_unregister_device(struct pci_controller *phb) -{ - iommu_device_unregister(&phb->iommu); - iommu_device_sysfs_remove(&phb->iommu); -} - /* * This registers IOMMU devices of PHBs. This needs to happen * after core_initcall(iommu_init) + postcore_initcall(pci_driver_init) and @@ -1413,7 +1398,11 @@ static int __init spapr_tce_setup_phb_iommus_initcall(void) struct pci_controller *hose; list_for_each_entry(hose, &hose_list, list_node) { - ppc_iommu_register_device(hose); + iommu_device_sysfs_add(&hose->iommu, hose->parent, + spapr_tce_iommu_groups, "iommu-phb%04x", + hose->global_number); + iommu_device_register(&hose->iommu, &spapr_tce_iommu_ops, + hose->parent); } return 0; } diff --git a/arch/powerpc/platforms/pseries/pci_dlpar.c b/arch/powerpc/platforms/pseries/pci_dlpar.c index 4448386268d9..4ba824568119 100644 --- a/arch/powerpc/platforms/pseries/pci_dlpar.c +++ b/arch/powerpc/platforms/pseries/pci_dlpar.c @@ -35,8 +35,6 @@ struct pci_controller *init_phb_dynamic(struct device_node *dn) pseries_msi_allocate_domains(phb); - ppc_iommu_register_device(phb); - /* Create EEH devices for the PHB */ eeh_phb_pe_create(phb); @@ -78,8 +76,6 @@ int remove_phb_dynamic(struct pci_controller *phb) } } - ppc_iommu_unregister_device(phb); - pseries_msi_free_domains(phb); /* Keep a reference so phb isn't freed yet */ -- cgit v1.2.3 From f1acb109505d983779bbb7e20a1ee6244d2b5736 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Mon, 12 Feb 2024 17:42:44 +1100 Subject: powerpc/kasan: Limit KASAN thread size increase to 32KB KASAN is seen to increase stack usage, to the point that it was reported to lead to stack overflow on some 32-bit machines (see link). To avoid overflows the stack size was doubled for KASAN builds in commit 3e8635fb2e07 ("powerpc/kasan: Force thread size increase with KASAN"). 
However with a 32KB stack size to begin with, the doubling leads to a 64KB
stack, which causes build errors:

  arch/powerpc/kernel/switch.S:249: Error: operand out of range (0x000000000000fe50 is not between 0xffffffffffff8000 and 0x0000000000007fff)

Although the asm could be reworked, in practice a 32KB stack seems sufficient
even for KASAN builds - the additional usage seems to be in the 2-3KB range
for a 64-bit KASAN build. So only increase the stack for KASAN if the stack
size is < 32KB.

Fixes: 18f14afe2816 ("powerpc/64s: Increase default stack size to 32KB")
Reported-by: Spoorthy
Reported-by: Benjamin Gray
Reviewed-by: Benjamin Gray
Link: https://lore.kernel.org/linuxppc-dev/bug-207129-206035@https.bugzilla.kernel.org%2F/
Signed-off-by: Michael Ellerman
Link: https://msgid.link/20240212064244.3924505-1-mpe@ellerman.id.au
---
 arch/powerpc/include/asm/thread_info.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h
index bf5dde1a4114..15c5691dd218 100644
--- a/arch/powerpc/include/asm/thread_info.h
+++ b/arch/powerpc/include/asm/thread_info.h
@@ -14,7 +14,7 @@

 #ifdef __KERNEL__

-#ifdef CONFIG_KASAN
+#if defined(CONFIG_KASAN) && CONFIG_THREAD_SHIFT < 15
 #define MIN_THREAD_SHIFT	(CONFIG_THREAD_SHIFT + 1)
 #else
 #define MIN_THREAD_SHIFT	CONFIG_THREAD_SHIFT
-- cgit v1.2.3


From eb6d871f4ba49ac8d0537e051fe983a3a4027f61 Mon Sep 17 00:00:00 2001
From: David Engraf
Date: Wed, 7 Feb 2024 10:27:58 +0100
Subject: powerpc/cputable: Add missing PPC_FEATURE_BOOKE on PPC64 Book-E

Commit e320a76db4b0 ("powerpc/cputable: Split cpu_specs[] out of cputable.h")
moved the cpu_specs to separate header files. Previously PPC_FEATURE_BOOKE
was enabled by CONFIG_PPC_BOOK3E_64. The definition in cpu_specs_e500mc.h for
PPC64 no longer enables PPC_FEATURE_BOOKE.

This breaks user space that reads the ELF hwcaps and expects
PPC_FEATURE_BOOKE to be set. Debugging an application with gdb no longer
works on e5500/e6500 because the 64-bit detection relies on
PPC_FEATURE_BOOKE for Book-E.
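For reference, userspace sees this through the auxiliary vector; a minimal
check (the constant is copied here to keep the sketch header-independent;
value as in arch/powerpc/include/uapi/asm/cputable.h):

	#include <stdio.h>
	#include <sys/auxv.h>

	#define PPC_FEATURE_BOOKE 0x00008000

	int main(void)
	{
		unsigned long hwcap = getauxval(AT_HWCAP);

		printf("BOOKE: %s\n", (hwcap & PPC_FEATURE_BOOKE) ? "yes" : "no");
		return 0;
	}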
Fixes: e320a76db4b0 ("powerpc/cputable: Split cpu_specs[] out of cputable.h") Cc: stable@vger.kernel.org # v6.1+ Signed-off-by: David Engraf Reviewed-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://msgid.link/20240207092758.1058893-1-david.engraf@sysgo.com --- arch/powerpc/kernel/cpu_specs_e500mc.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/cpu_specs_e500mc.h b/arch/powerpc/kernel/cpu_specs_e500mc.h index ceb06b109f83..2ae8e9a7b461 100644 --- a/arch/powerpc/kernel/cpu_specs_e500mc.h +++ b/arch/powerpc/kernel/cpu_specs_e500mc.h @@ -8,7 +8,8 @@ #ifdef CONFIG_PPC64 #define COMMON_USER_BOOKE (PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU | \ - PPC_FEATURE_HAS_FPU | PPC_FEATURE_64) + PPC_FEATURE_HAS_FPU | PPC_FEATURE_64 | \ + PPC_FEATURE_BOOKE) #else #define COMMON_USER_BOOKE (PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU | \ PPC_FEATURE_BOOKE) -- cgit v1.2.3 From ea73179e64131bcd29ba6defd33732abdf8ca14b Mon Sep 17 00:00:00 2001 From: Naveen N Rao Date: Tue, 13 Feb 2024 23:24:10 +0530 Subject: powerpc/ftrace: Ignore ftrace locations in exit text sections Michael reported that we are seeing an ftrace bug on bootup when KASAN is enabled and we are using -fpatchable-function-entry: ftrace: allocating 47780 entries in 18 pages ftrace-powerpc: 0xc0000000020b3d5c: No module provided for non-kernel address ------------[ ftrace bug ]------------ ftrace faulted on modifying [] 0xc0000000020b3d5c Initializing ftrace call sites ftrace record flags: 0 (0) expected tramp: c00000000008cef4 ------------[ cut here ]------------ WARNING: CPU: 0 PID: 0 at kernel/trace/ftrace.c:2180 ftrace_bug+0x3c0/0x424 Modules linked in: CPU: 0 PID: 0 Comm: swapper Not tainted 6.5.0-rc3-00120-g0f71dcfb4aef #860 Hardware name: IBM pSeries (emulated by qemu) POWER9 (raw) 0x4e1202 0xf000005 of:SLOF,HEAD hv:linux,kvm pSeries NIP: c0000000003aa81c LR: c0000000003aa818 CTR: 0000000000000000 REGS: c0000000033cfab0 TRAP: 0700 Not tainted (6.5.0-rc3-00120-g0f71dcfb4aef) MSR: 8000000002021033 CR: 28028240 XER: 00000000 CFAR: c0000000002781a8 IRQMASK: 3 ... NIP [c0000000003aa81c] ftrace_bug+0x3c0/0x424 LR [c0000000003aa818] ftrace_bug+0x3bc/0x424 Call Trace: ftrace_bug+0x3bc/0x424 (unreliable) ftrace_process_locs+0x5f4/0x8a0 ftrace_init+0xc0/0x1d0 start_kernel+0x1d8/0x484 With CONFIG_FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY=y and CONFIG_KASAN=y, compiler emits nops in functions that it generates for registering and unregistering global variables (unlike with -pg and -mprofile-kernel where calls to _mcount() are not generated in those functions). Those functions then end up in INIT_TEXT and EXIT_TEXT respectively. We don't expect to see any profiled functions in EXIT_TEXT, so ftrace_init_nop() assumes that all addresses that aren't in the core kernel text belongs to a module. Since these functions do not match that criteria, we see the above bug. Address this by having ftrace ignore all locations in the text exit sections of vmlinux. 
From ea73179e64131bcd29ba6defd33732abdf8ca14b Mon Sep 17 00:00:00 2001
From: Naveen N Rao
Date: Tue, 13 Feb 2024 23:24:10 +0530
Subject: powerpc/ftrace: Ignore ftrace locations in exit text sections

Michael reported that we are seeing an ftrace bug on bootup when KASAN
is enabled and we are using -fpatchable-function-entry:

  ftrace: allocating 47780 entries in 18 pages
  ftrace-powerpc: 0xc0000000020b3d5c: No module provided for non-kernel address
  ------------[ ftrace bug ]------------
  ftrace faulted on modifying
  [] 0xc0000000020b3d5c
  Initializing ftrace call sites
  ftrace record flags: 0
   (0)
   expected tramp: c00000000008cef4
  ------------[ cut here ]------------
  WARNING: CPU: 0 PID: 0 at kernel/trace/ftrace.c:2180 ftrace_bug+0x3c0/0x424
  Modules linked in:
  CPU: 0 PID: 0 Comm: swapper Not tainted 6.5.0-rc3-00120-g0f71dcfb4aef #860
  Hardware name: IBM pSeries (emulated by qemu) POWER9 (raw) 0x4e1202 0xf000005 of:SLOF,HEAD hv:linux,kvm pSeries
  NIP: c0000000003aa81c LR: c0000000003aa818 CTR: 0000000000000000
  REGS: c0000000033cfab0 TRAP: 0700 Not tainted (6.5.0-rc3-00120-g0f71dcfb4aef)
  MSR: 8000000002021033 CR: 28028240 XER: 00000000
  CFAR: c0000000002781a8 IRQMASK: 3
  ...
  NIP [c0000000003aa81c] ftrace_bug+0x3c0/0x424
  LR [c0000000003aa818] ftrace_bug+0x3bc/0x424
  Call Trace:
    ftrace_bug+0x3bc/0x424 (unreliable)
    ftrace_process_locs+0x5f4/0x8a0
    ftrace_init+0xc0/0x1d0
    start_kernel+0x1d8/0x484

With CONFIG_FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY=y and
CONFIG_KASAN=y, the compiler emits nops in functions that it generates
for registering and unregistering global variables (unlike with -pg and
-mprofile-kernel, where calls to _mcount() are not generated in those
functions). Those functions then end up in INIT_TEXT and EXIT_TEXT
respectively. We don't expect to see any profiled functions in
EXIT_TEXT, so ftrace_init_nop() assumes that all addresses that aren't
in the core kernel text belong to a module. Since these functions do
not match that criterion, we see the above bug.

Address this by having ftrace ignore all locations in the text exit
sections of vmlinux.

Fixes: 0f71dcfb4aef ("powerpc/ftrace: Add support for -fpatchable-function-entry")
Cc: stable@vger.kernel.org # v6.6+
Reported-by: Michael Ellerman
Signed-off-by: Naveen N Rao
Reviewed-by: Benjamin Gray
Signed-off-by: Michael Ellerman
Link: https://msgid.link/20240213175410.1091313-1-naveen@kernel.org
---
 arch/powerpc/include/asm/ftrace.h        | 10 ++--------
 arch/powerpc/include/asm/sections.h      |  1 +
 arch/powerpc/kernel/trace/ftrace.c       | 12 ++++++++++++
 arch/powerpc/kernel/trace/ftrace_64_pg.c |  5 +++++
 arch/powerpc/kernel/vmlinux.lds.S        |  2 ++
 5 files changed, 22 insertions(+), 8 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/ftrace.h b/arch/powerpc/include/asm/ftrace.h
index 1ebd2ca97f12..107fc5a48456 100644
--- a/arch/powerpc/include/asm/ftrace.h
+++ b/arch/powerpc/include/asm/ftrace.h
@@ -20,14 +20,6 @@
 #ifndef __ASSEMBLY__
 extern void _mcount(void);

-static inline unsigned long ftrace_call_adjust(unsigned long addr)
-{
-	if (IS_ENABLED(CONFIG_ARCH_USING_PATCHABLE_FUNCTION_ENTRY))
-		addr += MCOUNT_INSN_SIZE;
-
-	return addr;
-}
-
 unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip,
 				    unsigned long sp);

@@ -142,8 +134,10 @@ static inline u8 this_cpu_get_ftrace_enabled(void) { return 1; }
 #ifdef CONFIG_FUNCTION_TRACER
 extern unsigned int ftrace_tramp_text[], ftrace_tramp_init[];
 void ftrace_free_init_tramp(void);
+unsigned long ftrace_call_adjust(unsigned long addr);
 #else
 static inline void ftrace_free_init_tramp(void) { }
+static inline unsigned long ftrace_call_adjust(unsigned long addr) { return addr; }
 #endif

 #endif /* !__ASSEMBLY__ */
diff --git a/arch/powerpc/include/asm/sections.h b/arch/powerpc/include/asm/sections.h
index ea26665f82cf..f43f3a6b0051 100644
--- a/arch/powerpc/include/asm/sections.h
+++ b/arch/powerpc/include/asm/sections.h
@@ -14,6 +14,7 @@ typedef struct func_desc func_desc_t;

 extern char __head_end[];
 extern char __srwx_boundary[];
+extern char __exittext_begin[], __exittext_end[];

 /* Patch sites */
 extern s32 patch__call_flush_branch_caches1;
diff --git a/arch/powerpc/kernel/trace/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c
index 82010629cf88..d8d6b4fd9a14 100644
--- a/arch/powerpc/kernel/trace/ftrace.c
+++ b/arch/powerpc/kernel/trace/ftrace.c
@@ -27,10 +27,22 @@
 #include
 #include
 #include
+#include <asm/sections.h>

 #define	NUM_FTRACE_TRAMPS	2
 static unsigned long ftrace_tramps[NUM_FTRACE_TRAMPS];

+unsigned long ftrace_call_adjust(unsigned long addr)
+{
+	if (addr >= (unsigned long)__exittext_begin && addr < (unsigned long)__exittext_end)
+		return 0;
+
+	if (IS_ENABLED(CONFIG_ARCH_USING_PATCHABLE_FUNCTION_ENTRY))
+		addr += MCOUNT_INSN_SIZE;
+
+	return addr;
+}
+
 static ppc_inst_t ftrace_create_branch_inst(unsigned long ip, unsigned long addr, int link)
 {
 	ppc_inst_t op;
diff --git a/arch/powerpc/kernel/trace/ftrace_64_pg.c b/arch/powerpc/kernel/trace/ftrace_64_pg.c
index 7b85c3b460a3..12fab1803bcf 100644
--- a/arch/powerpc/kernel/trace/ftrace_64_pg.c
+++ b/arch/powerpc/kernel/trace/ftrace_64_pg.c
@@ -37,6 +37,11 @@
 #define	NUM_FTRACE_TRAMPS	8
 static unsigned long ftrace_tramps[NUM_FTRACE_TRAMPS];

+unsigned long ftrace_call_adjust(unsigned long addr)
+{
+	return addr;
+}
+
 static ppc_inst_t ftrace_call_replace(unsigned long ip, unsigned long addr,
 				      int link)
 {
diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S
index 1c5970df3233..f420df7888a7 100644
--- a/arch/powerpc/kernel/vmlinux.lds.S
+++ b/arch/powerpc/kernel/vmlinux.lds.S
@@ -281,7 +281,9 @@ SECTIONS
 	 * to deal with references from __bug_table
 	 */
 	.exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) {
+		__exittext_begin = .;
 		EXIT_TEXT
+		__exittext_end = .;
 	}

 	. = ALIGN(PAGE_SIZE);
--
cgit v1.2.3

From cbecc9fcbbec60136b0180ba0609c829afed5c81 Mon Sep 17 00:00:00 2001
From: Shrikanth Hegde
Date: Tue, 13 Feb 2024 10:56:35 +0530
Subject: powerpc/pseries: fix accuracy of stolen time

The PowerVM hypervisor updates the VPA fields with stolen time data.
It currently reports enqueue_dispatch_tb and ready_enqueue_tb for this
purpose. In Linux these two fields are used to report the stolen time.

The VPA fields are updated at the TB frequency, which on PowerPC is
typically 512MHz. Hence a conversion to ns is needed when reporting
the values back, since the rest of the kernel timings are in ns. This
conversion is already handled by the tb_to_ns() function, so use that
function to report accurate stolen time.

Observed this issue and used a Capped Shared Processor LPAR (SPLPAR)
to simplify the experiments. In all these cases, a 100% VP load is run
using a stress-ng workload. Values of stolen time are in percentages
as reported by mpstat. With the patch, the values are close to the
expected ones:

              6.8.rc1    +Patch
  12EC/12VP      0.0       0.0
  12EC/24VP     25.7      50.2
  12EC/36VP     37.3      69.2
  12EC/48VP     38.5      78.3

Fixes: 0e8a63132800 ("powerpc/pseries: Implement CONFIG_PARAVIRT_TIME_ACCOUNTING")
Cc: stable@vger.kernel.org # v6.1+
Signed-off-by: Shrikanth Hegde
Reviewed-by: Nicholas Piggin
Reviewed-by: Srikar Dronamraju
Signed-off-by: Michael Ellerman
Link: https://msgid.link/20240213052635.231597-1-sshegde@linux.ibm.com
---
 arch/powerpc/platforms/pseries/lpar.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c
index 4561667832ed..4e9916bb03d7 100644
--- a/arch/powerpc/platforms/pseries/lpar.c
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -662,8 +662,12 @@ u64 pseries_paravirt_steal_clock(int cpu)
 {
 	struct lppaca *lppaca = &lppaca_of(cpu);

-	return be64_to_cpu(READ_ONCE(lppaca->enqueue_dispatch_tb)) +
-		be64_to_cpu(READ_ONCE(lppaca->ready_enqueue_tb));
+	/*
+	 * VPA steal time counters are reported at TB frequency. Hence do a
+	 * conversion to ns before returning
+	 */
+	return tb_to_ns(be64_to_cpu(READ_ONCE(lppaca->enqueue_dispatch_tb)) +
+			be64_to_cpu(READ_ONCE(lppaca->ready_enqueue_tb)));
 }
 #endif
--
cgit v1.2.3
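The arithmetic behind the fix is simple: a timebase tick at 512MHz is
roughly 2ns, so returning raw tick counts as if they were nanoseconds
under-reports stolen time by about half, which matches the table above.
A rough user-space illustration follows; the 512MHz frequency is an
assumption, and the kernel's tb_to_ns() uses a precomputed multiplier
rather than this naive (and overflow-prone for very large deltas)
arithmetic:

  #include <stdio.h>
  #include <stdint.h>

  #define TB_FREQ_HZ   512000000ULL    /* assumed typical PowerPC timebase */
  #define NSEC_PER_SEC 1000000000ULL

  /* Naive equivalent of the kernel's tb_to_ns(): TB ticks -> ns. */
  static uint64_t tb_to_ns_naive(uint64_t tb_ticks)
  {
      return tb_ticks * NSEC_PER_SEC / TB_FREQ_HZ;
  }

  int main(void)
  {
      uint64_t ticks = 256000000;    /* half a second of stolen time */

      printf("%llu ticks = %llu ns\n",
             (unsigned long long)ticks,
             (unsigned long long)tb_to_ns_naive(ticks));
      return 0;
  }

This prints 500000000 ns; before the patch the same half second would
have been accounted as only 256000000 ns.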
From 0846dd77c8349ec92ca0079c9c71d130f34cb192 Mon Sep 17 00:00:00 2001
From: Shivaprasad G Bhat
Date: Tue, 13 Feb 2024 10:05:22 -0600
Subject: powerpc/iommu: Fix the missing iommu_group_put() during platform
 domain attach

The function spapr_tce_platform_iommu_attach_dev() is missing a call
to iommu_group_put() when the domain is already set. This refcount
leak shows up with a BUG_ON() during the DLPAR remove operation as:

  KernelBug: Kernel bug in state 'None': kernel BUG at arch/powerpc/platforms/pseries/iommu.c:100!
  Oops: Exception in kernel mode, sig: 5 [#1]
  LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=8192 NUMA pSeries
  Hardware name: IBM,9080-HEX POWER10 (raw) 0x800200 0xf000006 of:IBM,FW1060.00 (NH1060_016) hv:phyp pSeries
  NIP: c0000000000ff4d4 LR: c0000000000ff4cc CTR: 0000000000000000
  REGS: c0000013aed5f840 TRAP: 0700 Tainted: G I (6.8.0-rc3-autotest-g99bd3cb0d12e)
  MSR: 8000000000029033 CR: 44002402 XER: 20040000
  CFAR: c000000000a0d170 IRQMASK: 0
  ...
  NIP iommu_reconfig_notifier+0x94/0x200
  LR iommu_reconfig_notifier+0x8c/0x200
  Call Trace:
    iommu_reconfig_notifier+0x8c/0x200 (unreliable)
    notifier_call_chain+0xb8/0x19c
    blocking_notifier_call_chain+0x64/0x98
    of_reconfig_notify+0x44/0xdc
    of_detach_node+0x78/0xb0
    ofdt_write.part.0+0x86c/0xbb8
    proc_reg_write+0xf4/0x150
    vfs_write+0xf8/0x488
    ksys_write+0x84/0x140
    system_call_exception+0x138/0x330
    system_call_vectored_common+0x15c/0x2ec

The patch adds the missing iommu_group_put() call.

Fixes: a8ca9fc9134c ("powerpc/iommu: Do not do platform domain attach atctions after probe")
Reported-by: Venkat Rao Bagalkote
Closes: https://lore.kernel.org/all/274e0d2b-b5cc-475e-94e6-8427e88e271d@linux.vnet.ibm.com/
Signed-off-by: Shivaprasad G Bhat
Tested-by: Venkat Rao Bagalkote
Reviewed-by: Jason Gunthorpe
Signed-off-by: Michael Ellerman
Link: https://msgid.link/170784021983.6249.10039296655906636112.stgit@linux.ibm.com
---
 arch/powerpc/kernel/iommu.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index ebe259bdd462..df17b33b89d1 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -1290,8 +1290,10 @@ spapr_tce_platform_iommu_attach_dev(struct iommu_domain *platform_domain,
 	int ret = -EINVAL;

 	/* At first attach the ownership is already set */
-	if (!domain)
+	if (!domain) {
+		iommu_group_put(grp);
 		return 0;
+	}

 	if (!grp)
 		return -ENODEV;
--
cgit v1.2.3
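The bug class here is a general one: a reference handed to a function
must be dropped on every exit path, including early "nothing to do"
successes. A generic sketch of the pattern, with hypothetical types and
helpers rather than the kernel's iommu API:

  #include <stdio.h>

  struct group { int refcount; };

  static void group_put(struct group *grp)
  {
      if (grp)
          grp->refcount--;
  }

  /* The caller hands us a reference on grp; we must release it on
   * every return, not just on the main path. */
  static int attach_dev(struct group *grp, void *domain)
  {
      if (!domain) {
          group_put(grp);    /* the put the patch adds */
          return 0;
      }

      if (!grp)
          return -1;

      /* ... actual attach work ... */
      group_put(grp);
      return 0;
  }

  int main(void)
  {
      struct group g = { .refcount = 1 };

      attach_dev(&g, NULL);
      printf("refcount after early return: %d\n", g.refcount);
      return 0;
  }

Without the early-return put, the refcount never reaches zero and the
group outlives the DLPAR remove, tripping the BUG_ON() above.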
From 3b9ab248bc45abf8c2365ed3eec86cdefd4d626a Mon Sep 17 00:00:00 2001
From: Masahiro Yamada
Date: Fri, 2 Feb 2024 10:31:42 +0900
Subject: kbuild: use 4-space indentation when followed by conditionals

The GNU Make manual [1] clearly forbids a tab at the beginning of the
conditional directive line:

  "Extra spaces are allowed and ignored at the beginning of the
  conditional directive line, but a tab is not allowed."

This will not work for the next release of GNU Make, hence commit
82175d1f9430 ("kbuild: Replace tabs with spaces when followed by
conditionals") replaced the inappropriate tabs with 8 spaces. However,
the 8-space indentation cannot be visually distinguished. Linus
suggested 2-4 spaces for those nested if-statements. [2]

This commit redoes the replacement with 4 spaces.

[1]: https://www.gnu.org/software/make/manual/make.html#Conditional-Syntax
[2]: https://lore.kernel.org/all/CAHk-=whJKZNZWsa-VNDKafS_VfY4a5dAjG-r8BZgWk_a-xSepw@mail.gmail.com/

Suggested-by: Linus Torvalds
Signed-off-by: Masahiro Yamada
---
 Makefile             | 12 ++++++------
 arch/m68k/Makefile   |  4 ++--
 arch/parisc/Makefile |  4 ++--
 arch/x86/Makefile    |  8 ++++----
 4 files changed, 14 insertions(+), 14 deletions(-)

(limited to 'arch')

diff --git a/Makefile b/Makefile
index 7e0b2ad98905..ed80c7d98a7e 100644
--- a/Makefile
+++ b/Makefile
@@ -294,15 +294,15 @@ may-sync-config := 1
 single-build	:=

 ifneq ($(filter $(no-dot-config-targets), $(MAKECMDGOALS)),)
-	ifeq ($(filter-out $(no-dot-config-targets), $(MAKECMDGOALS)),)
+    ifeq ($(filter-out $(no-dot-config-targets), $(MAKECMDGOALS)),)
 		need-config :=
-	endif
+    endif
 endif

 ifneq ($(filter $(no-sync-config-targets), $(MAKECMDGOALS)),)
-	ifeq ($(filter-out $(no-sync-config-targets), $(MAKECMDGOALS)),)
+    ifeq ($(filter-out $(no-sync-config-targets), $(MAKECMDGOALS)),)
 		may-sync-config :=
-	endif
+    endif
 endif

 need-compiler := $(may-sync-config)
@@ -323,9 +323,9 @@ endif
 # We cannot build single targets and the others at the same time
 ifneq ($(filter $(single-targets), $(MAKECMDGOALS)),)
 	single-build := 1
-	ifneq ($(filter-out $(single-targets), $(MAKECMDGOALS)),)
+    ifneq ($(filter-out $(single-targets), $(MAKECMDGOALS)),)
 		mixed-build := 1
-	endif
+    endif
 endif

 # For "make -j clean all", "make -j mrproper defconfig all", etc.
diff --git a/arch/m68k/Makefile b/arch/m68k/Makefile
index 76ef1a67c361..0abcf994ce55 100644
--- a/arch/m68k/Makefile
+++ b/arch/m68k/Makefile
@@ -15,10 +15,10 @@ KBUILD_DEFCONFIG := multi_defconfig

 ifdef cross_compiling
-	ifeq ($(CROSS_COMPILE),)
+    ifeq ($(CROSS_COMPILE),)
 		CROSS_COMPILE := $(call cc-cross-prefix, \
 			m68k-linux-gnu- m68k-linux- m68k-unknown-linux-gnu-)
-	endif
+    endif
 endif

 #
diff --git a/arch/parisc/Makefile b/arch/parisc/Makefile
index 7486b3b30594..316f84f1d15c 100644
--- a/arch/parisc/Makefile
+++ b/arch/parisc/Makefile
@@ -50,12 +50,12 @@ export CROSS32CC

 # Set default cross compiler for kernel build
 ifdef cross_compiling
-	ifeq ($(CROSS_COMPILE),)
+    ifeq ($(CROSS_COMPILE),)
 		CC_SUFFIXES = linux linux-gnu unknown-linux-gnu suse-linux
 		CROSS_COMPILE := $(call cc-cross-prefix, \
 			$(foreach a,$(CC_ARCHES), \
 			$(foreach s,$(CC_SUFFIXES),$(a)-$(s)-)))
-	endif
+    endif
 endif

 ifdef CONFIG_DYNAMIC_FTRACE
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 2264db14a25d..da8f3caf2781 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -112,13 +112,13 @@ ifeq ($(CONFIG_X86_32),y)
 	# temporary until string.h is fixed
 	KBUILD_CFLAGS += -ffreestanding

-	ifeq ($(CONFIG_STACKPROTECTOR),y)
-		ifeq ($(CONFIG_SMP),y)
+    ifeq ($(CONFIG_STACKPROTECTOR),y)
+        ifeq ($(CONFIG_SMP),y)
 			KBUILD_CFLAGS += -mstack-protector-guard-reg=fs -mstack-protector-guard-symbol=__stack_chk_guard
-		else
+        else
 			KBUILD_CFLAGS += -mstack-protector-guard=global
-		endif
 	endif
+    endif
 else
 	BITS := 64
 	UTS_MACHINE := x86_64
--
cgit v1.2.3
From fb091ff394792c018527b3211bbdfae93ea4ac02 Mon Sep 17 00:00:00 2001
From: Easwar Hariharan
Date: Wed, 14 Feb 2024 17:55:18 +0000
Subject: arm64: Subscribe Microsoft Azure Cobalt 100 to ARM Neoverse N2
 errata

Add the MIDR value of Microsoft Azure Cobalt 100, which is a
Microsoft-implemented CPU based on r0p0 of the ARM Neoverse N2 CPU,
and therefore suffers from all the same errata.

CC: stable@vger.kernel.org # 5.15+
Signed-off-by: Easwar Hariharan
Reviewed-by: Anshuman Khandual
Acked-by: Mark Rutland
Acked-by: Marc Zyngier
Reviewed-by: Oliver Upton
Link: https://lore.kernel.org/r/20240214175522.2457857-1-eahariha@linux.microsoft.com
Signed-off-by: Will Deacon
---
 Documentation/arch/arm64/silicon-errata.rst | 7 +++++++
 arch/arm64/include/asm/cputype.h            | 4 ++++
 arch/arm64/kernel/cpu_errata.c              | 3 +++
 3 files changed, 14 insertions(+)

(limited to 'arch')

diff --git a/Documentation/arch/arm64/silicon-errata.rst b/Documentation/arch/arm64/silicon-errata.rst
index e8c2ce1f9df6..45a7f4932fe0 100644
--- a/Documentation/arch/arm64/silicon-errata.rst
+++ b/Documentation/arch/arm64/silicon-errata.rst
@@ -243,3 +243,10 @@ stable kernels.
 +----------------+-----------------+-----------------+-----------------------------+
 | ASR            | ASR8601         | #8601001        | N/A                         |
 +----------------+-----------------+-----------------+-----------------------------+
++----------------+-----------------+-----------------+-----------------------------+
+| Microsoft      | Azure Cobalt 100| #2139208        | ARM64_ERRATUM_2139208       |
++----------------+-----------------+-----------------+-----------------------------+
+| Microsoft      | Azure Cobalt 100| #2067961        | ARM64_ERRATUM_2067961       |
++----------------+-----------------+-----------------+-----------------------------+
+| Microsoft      | Azure Cobalt 100| #2253138        | ARM64_ERRATUM_2253138       |
++----------------+-----------------+-----------------+-----------------------------+
diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h
index 7c7493cb571f..52f076afeb96 100644
--- a/arch/arm64/include/asm/cputype.h
+++ b/arch/arm64/include/asm/cputype.h
@@ -61,6 +61,7 @@
 #define ARM_CPU_IMP_HISI		0x48
 #define ARM_CPU_IMP_APPLE		0x61
 #define ARM_CPU_IMP_AMPERE		0xC0
+#define ARM_CPU_IMP_MICROSOFT		0x6D

 #define ARM_CPU_PART_AEM_V8		0xD0F
 #define ARM_CPU_PART_FOUNDATION		0xD00
@@ -135,6 +136,8 @@

 #define AMPERE_CPU_PART_AMPERE1	0xAC3

+#define MICROSOFT_CPU_PART_AZURE_COBALT_100	0xD49 /* Based on r0p0 of ARM Neoverse N2 */
+
 #define MIDR_CORTEX_A53 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A53)
 #define MIDR_CORTEX_A57 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A57)
 #define MIDR_CORTEX_A72 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A72)
@@ -193,6 +196,7 @@
 #define MIDR_APPLE_M2_BLIZZARD_MAX MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M2_BLIZZARD_MAX)
 #define MIDR_APPLE_M2_AVALANCHE_MAX MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M2_AVALANCHE_MAX)
 #define MIDR_AMPERE1 MIDR_CPU_MODEL(ARM_CPU_IMP_AMPERE, AMPERE_CPU_PART_AMPERE1)
+#define MIDR_MICROSOFT_AZURE_COBALT_100 MIDR_CPU_MODEL(ARM_CPU_IMP_MICROSOFT, MICROSOFT_CPU_PART_AZURE_COBALT_100)

 /* Fujitsu Erratum 010001 affects A64FX 1.0 and 1.1, (v0r0 and v1r0) */
 #define MIDR_FUJITSU_ERRATUM_010001		MIDR_FUJITSU_A64FX
diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c
index 967c7c7a4e7d..76b8dd37092a 100644
--- a/arch/arm64/kernel/cpu_errata.c
+++ b/arch/arm64/kernel/cpu_errata.c
@@ -374,6 +374,7 @@ static const struct midr_range erratum_1463225[] = {
 static const struct midr_range trbe_overwrite_fill_mode_cpus[] = {
 #ifdef CONFIG_ARM64_ERRATUM_2139208
 	MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2),
+	MIDR_ALL_VERSIONS(MIDR_MICROSOFT_AZURE_COBALT_100),
 #endif
 #ifdef CONFIG_ARM64_ERRATUM_2119858
 	MIDR_ALL_VERSIONS(MIDR_CORTEX_A710),
@@ -387,6 +388,7 @@ static const struct midr_range trbe_overwrite_fill_mode_cpus[] = {
 static const struct midr_range tsb_flush_fail_cpus[] = {
 #ifdef CONFIG_ARM64_ERRATUM_2067961
 	MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2),
+	MIDR_ALL_VERSIONS(MIDR_MICROSOFT_AZURE_COBALT_100),
 #endif
 #ifdef CONFIG_ARM64_ERRATUM_2054223
 	MIDR_ALL_VERSIONS(MIDR_CORTEX_A710),
@@ -399,6 +401,7 @@ static const struct midr_range tsb_flush_fail_cpus[] = {
 static struct midr_range trbe_write_out_of_range_cpus[] = {
 #ifdef CONFIG_ARM64_ERRATUM_2253138
 	MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2),
+	MIDR_ALL_VERSIONS(MIDR_MICROSOFT_AZURE_COBALT_100),
 #endif
 #ifdef CONFIG_ARM64_ERRATUM_2224489
 	MIDR_ALL_VERSIONS(MIDR_CORTEX_A710),
--
cgit v1.2.3

From 2813926261e436d33bc74486b51cce60b76edf78 Mon Sep 17 00:00:00 2001
From: Mark Brown
Date: Tue, 13 Feb 2024 18:24:38 +0000
Subject: arm64/sve: Lower the maximum allocation for the SVE ptrace regset

Doug Anderson observed that ChromeOS crashes are being reported which
include failing allocations of order 7 during core dumps due to ptrace
allocating storage for regsets:

  chrome: page allocation failure: order:7, mode:0x40dc0(GFP_KERNEL|__GFP_COMP|__GFP_ZERO), nodemask=(null),cpuset=urgent,mems_allowed=0
  ...
  regset_get_alloc+0x1c/0x28
  elf_core_dump+0x3d8/0xd8c
  do_coredump+0xeb8/0x1378

with further investigation showing that this is:

  [   66.957385] DOUG: Allocating 279584 bytes

which is the maximum size of the SVE regset. As Doug observes, it is
not entirely surprising that such a large allocation of contiguous
memory might fail on a long running system.

The SVE regset is currently sized to hold SVE registers with a VQ of
SVE_VQ_MAX, which is 512, substantially more than the architectural
maximum of 16 which we might see even in a system emulating the limits
of the architecture. Since we don't expose the size we tell the regset
core externally, let's define ARCH_SVE_VQ_MAX with the actual
architectural maximum and use that for the regset. We'll still
overallocate most of the time, but much less so, which will be helpful
even if the core is fixed to not require contiguous allocations.
Specify ARCH_SVE_VQ_MAX in terms of the maximum value that can be
written into ZCR_ELx.LEN (where this is set in the hardware).

For consistency, update the maximum SME vector length to be specified
in the same style while we are at it.

We could also teach the ptrace core about runtime discoverable regset
sizes, but that would be a more invasive change and this is being
observed in practical systems.

Reported-by: Doug Anderson
Signed-off-by: Mark Brown
Tested-by: Douglas Anderson
Link: https://lore.kernel.org/r/20240213-arm64-sve-ptrace-regset-size-v2-1-c7600ca74b9b@kernel.org
Signed-off-by: Will Deacon
---
 arch/arm64/include/asm/fpsimd.h | 12 ++++++------
 arch/arm64/kernel/ptrace.c      |  3 ++-
 2 files changed, 8 insertions(+), 7 deletions(-)

(limited to 'arch')

diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h
index 50e5f25d3024..481d94416d69 100644
--- a/arch/arm64/include/asm/fpsimd.h
+++ b/arch/arm64/include/asm/fpsimd.h
@@ -62,13 +62,13 @@ static inline void cpacr_restore(unsigned long cpacr)
  * When we defined the maximum SVE vector length we defined the ABI so
  * that the maximum vector length included all the reserved for future
  * expansion bits in ZCR rather than those just currently defined by
- * the architecture. While SME follows a similar pattern the fact that
- * it includes a square matrix means that any allocations that attempt
- * to cover the maximum potential vector length (such as happen with
- * the regset used for ptrace) end up being extremely large. Define
- * the much lower actual limit for use in such situations.
+ * the architecture. Using this length to allocate worst size buffers
+ * results in excessively large allocations, and this effect is even
+ * more pronounced for SME due to ZA. Define more suitable VLs for
+ * these situations.
  */
-#define SME_VQ_MAX	16
+#define ARCH_SVE_VQ_MAX ((ZCR_ELx_LEN_MASK >> ZCR_ELx_LEN_SHIFT) + 1)
+#define SME_VQ_MAX	((SMCR_ELx_LEN_MASK >> SMCR_ELx_LEN_SHIFT) + 1)

 struct task_struct;
diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c
index dc6cf0e37194..e3bef38fc2e2 100644
--- a/arch/arm64/kernel/ptrace.c
+++ b/arch/arm64/kernel/ptrace.c
@@ -1500,7 +1500,8 @@ static const struct user_regset aarch64_regsets[] = {
 #ifdef CONFIG_ARM64_SVE
 	[REGSET_SVE] = { /* Scalable Vector Extension */
 		.core_note_type = NT_ARM_SVE,
-		.n = DIV_ROUND_UP(SVE_PT_SIZE(SVE_VQ_MAX, SVE_PT_REGS_SVE),
+		.n = DIV_ROUND_UP(SVE_PT_SIZE(ARCH_SVE_VQ_MAX,
+					      SVE_PT_REGS_SVE),
 				  SVE_VQ_BYTES),
 		.size = SVE_VQ_BYTES,
 		.align = SVE_VQ_BYTES,
--
cgit v1.2.3
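The new limits are derived from the architectural LEN fields rather
than the ABI's reserved-bit ceiling. Assuming the usual encoding
(ZCR_ELx.LEN is a 4-bit field at bits [3:0], so the mask is 0xf and the
shift is 0; this is an assumption here, the authoritative values live
in the arm64 sysreg headers), the arithmetic works out as in this
sketch:

  #include <stdio.h>

  /* Assumed field layout, mirroring the kernel's sysreg encoding. */
  #define ZCR_ELx_LEN_SHIFT 0
  #define ZCR_ELx_LEN_MASK  0xf

  #define ARCH_SVE_VQ_MAX ((ZCR_ELx_LEN_MASK >> ZCR_ELx_LEN_SHIFT) + 1)
  #define SVE_VQ_BYTES    16    /* one quadword, 128 bits */

  int main(void)
  {
      /* 16 VQs gives a 256-byte maximum Z register, versus the
       * 512-VQ ABI ceiling that sized the old regset. */
      printf("ARCH_SVE_VQ_MAX = %d (max Z reg: %d bytes)\n",
             ARCH_SVE_VQ_MAX, ARCH_SVE_VQ_MAX * SVE_VQ_BYTES);
      return 0;
  }

Sizing the regset for 16 VQs instead of 512 shrinks the worst-case
contiguous allocation by roughly a factor of 32.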
From ee0e39a63b78849f8abbef268b13e4838569f646 Mon Sep 17 00:00:00 2001
From: Hou Tao
Date: Fri, 2 Feb 2024 18:39:33 +0800
Subject: x86/mm: Move is_vsyscall_vaddr() into asm/vsyscall.h

Move is_vsyscall_vaddr() into asm/vsyscall.h to make it available for
copy_from_kernel_nofault_allowed() in arch/x86/mm/maccess.c.

Reviewed-by: Sohil Mehta
Signed-off-by: Hou Tao
Link: https://lore.kernel.org/r/20240202103935.3154011-2-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov
---
 arch/x86/include/asm/vsyscall.h | 10 ++++++++++
 arch/x86/mm/fault.c             |  9 ---------
 2 files changed, 10 insertions(+), 9 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h
index ab60a71a8dcb..472f0263dbc6 100644
--- a/arch/x86/include/asm/vsyscall.h
+++ b/arch/x86/include/asm/vsyscall.h
@@ -4,6 +4,7 @@

 #include
 #include
+#include <asm/page_types.h>

 #ifdef CONFIG_X86_VSYSCALL_EMULATION
 extern void map_vsyscall(void);
@@ -24,4 +25,13 @@ static inline bool emulate_vsyscall(unsigned long error_code,
 }
 #endif

+/*
+ * The (legacy) vsyscall page is the long page in the kernel portion
+ * of the address space that has user-accessible permissions.
+ */
+static inline bool is_vsyscall_vaddr(unsigned long vaddr)
+{
+	return unlikely((vaddr & PAGE_MASK) == VSYSCALL_ADDR);
+}
+
 #endif /* _ASM_X86_VSYSCALL_H */
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 679b09cfe241..d6375b3c633b 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -798,15 +798,6 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code,
 	show_opcodes(regs, loglvl);
 }

-/*
- * The (legacy) vsyscall page is the long page in the kernel portion
- * of the address space that has user-accessible permissions.
- */
-static bool is_vsyscall_vaddr(unsigned long vaddr)
-{
-	return unlikely((vaddr & PAGE_MASK) == VSYSCALL_ADDR);
-}
-
 static void
 __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
 		       unsigned long address, u32 pkey, int si_code)
--
cgit v1.2.3
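The helper being moved is a single masked compare against a fixed
address. The sketch below mirrors it in user space; the 4KB page mask
and the VSYSCALL_ADDR value (0xffffffffff600000, which is also the
faulting address in the oops quoted in the next patch) are hard-coded
assumptions here rather than taken from kernel headers:

  #include <stdio.h>
  #include <stdint.h>

  #define PAGE_MASK     (~0xfffUL)              /* assumed 4KB pages */
  #define VSYSCALL_ADDR 0xffffffffff600000UL    /* assumed fixed address */

  static int is_vsyscall_vaddr(uint64_t vaddr)
  {
      return (vaddr & PAGE_MASK) == VSYSCALL_ADDR;
  }

  int main(void)
  {
      printf("%d\n", is_vsyscall_vaddr(0xffffffffff600800UL));    /* 1 */
      printf("%d\n", is_vsyscall_vaddr(0xffffffffff601000UL));    /* 0 */
      return 0;
  }

Any address within the single vsyscall page matches; the first address
past it does not.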
From 32019c659ecfe1d92e3bf9fcdfbb11a7c70acd58 Mon Sep 17 00:00:00 2001
From: Hou Tao
Date: Fri, 2 Feb 2024 18:39:34 +0800
Subject: x86/mm: Disallow vsyscall page read for copy_from_kernel_nofault()

When trying to use copy_from_kernel_nofault() to read the vsyscall page
through a bpf program, the following oops was reported:

  BUG: unable to handle page fault for address: ffffffffff600000
  #PF: supervisor read access in kernel mode
  #PF: error_code(0x0000) - not-present page
  PGD 3231067 P4D 3231067 PUD 3233067 PMD 3235067 PTE 0
  Oops: 0000 [#1] PREEMPT SMP PTI
  CPU: 1 PID: 20390 Comm: test_progs ...... 6.7.0+ #58
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996) ......
  RIP: 0010:copy_from_kernel_nofault+0x6f/0x110
  ......
  Call Trace:
   ? copy_from_kernel_nofault+0x6f/0x110
   bpf_probe_read_kernel+0x1d/0x50
   bpf_prog_2061065e56845f08_do_probe_read+0x51/0x8d
   trace_call_bpf+0xc5/0x1c0
   perf_call_bpf_enter.isra.0+0x69/0xb0
   perf_syscall_enter+0x13e/0x200
   syscall_trace_enter+0x188/0x1c0
   do_syscall_64+0xb5/0xe0
   entry_SYSCALL_64_after_hwframe+0x6e/0x76
  ......
  ---[ end trace 0000000000000000 ]---

The oops is triggered when:

1) A bpf program uses bpf_probe_read_kernel() to read from the vsyscall
   page and invokes copy_from_kernel_nofault() which in turn calls
   __get_user_asm().

2) Because the vsyscall page address is not readable from kernel space,
   a page fault exception is triggered accordingly.

3) handle_page_fault() considers the vsyscall page address as a user
   space address instead of a kernel space address. This results in the
   fix-up set up by bpf not being applied, and a page_fault_oops() is
   invoked due to SMAP.

Considering handle_page_fault() has already considered the vsyscall
page address as a userspace address, fix the problem by disallowing
vsyscall page read for copy_from_kernel_nofault().

Originally-by: Thomas Gleixner
Reported-by: syzbot+72aa0161922eba61b50e@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/bpf/CAG48ez06TZft=ATH1qh2c5mpS5BT8UakwNkzi6nvK5_djC-4Nw@mail.gmail.com
Reported-by: xingwei lee
Closes: https://lore.kernel.org/bpf/CABOYnLynjBoFZOf3Z4BhaZkc5hx_kHfsjiW+UWLoB=w33LvScw@mail.gmail.com
Signed-off-by: Hou Tao
Reviewed-by: Sohil Mehta
Acked-by: Thomas Gleixner
Link: https://lore.kernel.org/r/20240202103935.3154011-3-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov
---
 arch/x86/mm/maccess.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'arch')

diff --git a/arch/x86/mm/maccess.c b/arch/x86/mm/maccess.c
index 6993f026adec..42115ac079cf 100644
--- a/arch/x86/mm/maccess.c
+++ b/arch/x86/mm/maccess.c
@@ -3,6 +3,8 @@
 #include
 #include

+#include <asm/vsyscall.h>
+
 #ifdef CONFIG_X86_64
 bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size)
 {
@@ -15,6 +17,14 @@ bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size)
 	if (vaddr < TASK_SIZE_MAX + PAGE_SIZE)
 		return false;

+	/*
+	 * Reading from the vsyscall page may cause an unhandled fault in
+	 * certain cases. Though it is at an address above TASK_SIZE_MAX, it is
+	 * usually considered as a user space address.
+	 */
+	if (is_vsyscall_vaddr(vaddr))
+		return false;
+
 	/*
 	 * Allow everything during early boot before 'x86_virt_bits'
 	 * is initialized. Needed for instruction decoding in early
--
cgit v1.2.3

From 8d3a7dfb801d157ac423261d7cd62c33e95375f8 Mon Sep 17 00:00:00 2001
From: Oliver Upton
Date: Wed, 21 Feb 2024 09:27:31 +0000
Subject: KVM: arm64: vgic-its: Test for valid IRQ in its_sync_lpi_pending_table()

vgic_get_irq() may not return a valid descriptor if there is no ITS
that holds a valid translation for the specified INTID. If that is the
case, it is safe to silently ignore it and continue processing the LPI
pending table.

Cc: stable@vger.kernel.org
Fixes: 33d3bc9556a7 ("KVM: arm64: vgic-its: Read initial LPI pending table")
Signed-off-by: Oliver Upton
Link: https://lore.kernel.org/r/20240221092732.4126848-2-oliver.upton@linux.dev
Signed-off-by: Marc Zyngier
---
 arch/arm64/kvm/vgic/vgic-its.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'arch')

diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c
index e2764d0ffa9f..082448de27ed 100644
--- a/arch/arm64/kvm/vgic/vgic-its.c
+++ b/arch/arm64/kvm/vgic/vgic-its.c
@@ -468,6 +468,9 @@ static int its_sync_lpi_pending_table(struct kvm_vcpu *vcpu)
 		}

 		irq = vgic_get_irq(vcpu->kvm, NULL, intids[i]);
+		if (!irq)
+			continue;
+
 		raw_spin_lock_irqsave(&irq->irq_lock, flags);
 		irq->pending_latch = pendmask & (1U << bit_nr);
 		vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
--
cgit v1.2.3

From 85a71ee9a0700f6c18862ef3b0011ed9dad99aca Mon Sep 17 00:00:00 2001
From: Oliver Upton
Date: Wed, 21 Feb 2024 09:27:32 +0000
Subject: KVM: arm64: vgic-its: Test for valid IRQ in MOVALL handler

It is possible that an LPI mapped in a different ITS gets unmapped
while handling the MOVALL command. If that is the case, there is no
state that can be migrated to the destination. Silently ignore it and
continue migrating other LPIs.

Cc: stable@vger.kernel.org
Fixes: ff9c114394aa ("KVM: arm/arm64: GICv4: Handle MOVALL applied to a vPE")
Signed-off-by: Oliver Upton
Link: https://lore.kernel.org/r/20240221092732.4126848-3-oliver.upton@linux.dev
Signed-off-by: Marc Zyngier
---
 arch/arm64/kvm/vgic/vgic-its.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch')

diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c
index 082448de27ed..28a93074eca1 100644
--- a/arch/arm64/kvm/vgic/vgic-its.c
+++ b/arch/arm64/kvm/vgic/vgic-its.c
@@ -1435,6 +1435,8 @@ static int vgic_its_cmd_handle_movall(struct kvm *kvm, struct vgic_its *its,

 	for (i = 0; i < irq_count; i++) {
 		irq = vgic_get_irq(kvm, NULL, intids[i]);
+		if (!irq)
+			continue;

 		update_affinity(irq, vcpu2);
--
cgit v1.2.3
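Both vgic-its fixes are instances of the same defensive pattern: a
lookup that can legitimately return NULL must be checked before the
result is dereferenced, and in a batch loop the right response is
usually to skip the entry rather than abort the whole operation. A
generic sketch with hypothetical lookup and table names, not the KVM
API:

  #include <stdio.h>

  struct irq_desc { int intid; int mapped; };

  static struct irq_desc table[4] = {
      { 0, 1 }, { 1, 0 }, { 2, 1 }, { 3, 0 },
  };

  /* Hypothetical lookup: returns NULL when no translation exists,
   * e.g. because the LPI was unmapped while the batch was running. */
  static struct irq_desc *lookup_irq(int intid)
  {
      if (intid < 0 || intid >= 4 || !table[intid].mapped)
          return NULL;
      return &table[intid];
  }

  int main(void)
  {
      for (int i = 0; i < 4; i++) {
          struct irq_desc *irq = lookup_irq(i);

          if (!irq)    /* the check both patches add */
              continue;
          printf("processing intid %d\n", irq->intid);
      }
      return 0;
  }

Skipping unmapped entries preserves forward progress for the remaining
LPIs, which is exactly the behavior both commit messages describe.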