From 5844cc25fd121074de7895181a2fa1ce100a0fdd Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 26 Nov 2020 20:25:27 +1000 Subject: powerpc/64s: Fix hash ISA v3.0 TLBIEL instruction generation A typo has the R field of the instruction assigned by lucky dip a la register allocator. Fixes: d4748276ae14c ("powerpc/64s: Improve local TLB flush for boot and MCE on POWER9") Signed-off-by: Nicholas Piggin Reviewed-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20201126102530.691335-2-npiggin@gmail.com --- arch/powerpc/mm/book3s64/hash_native.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/mm/book3s64/hash_native.c b/arch/powerpc/mm/book3s64/hash_native.c index 0203cdf48c54..97fa42d7027e 100644 --- a/arch/powerpc/mm/book3s64/hash_native.c +++ b/arch/powerpc/mm/book3s64/hash_native.c @@ -68,7 +68,7 @@ static __always_inline void tlbiel_hash_set_isa300(unsigned int set, unsigned in rs = ((unsigned long)pid << PPC_BITLSHIFT(31)); asm volatile(PPC_TLBIEL(%0, %1, %2, %3, %4) - : : "r"(rb), "r"(rs), "i"(ric), "i"(prs), "r"(r) + : : "r"(rb), "r"(rs), "i"(ric), "i"(prs), "i"(r) : "memory"); } -- cgit v1.2.3 From c0b27c517acf8a2b359dd373a7e7e88b01a8308e Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 26 Nov 2020 20:25:28 +1000 Subject: powerpc/64s/pseries: Fix hash tlbiel_all_isa300 for guest kernels tlbiel_all() can not be usable in !HVMODE when running hash presently, remove HV privileged flushes when running in guest to make it usable. Signed-off-by: Nicholas Piggin Reviewed-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20201126102530.691335-3-npiggin@gmail.com --- arch/powerpc/mm/book3s64/hash_native.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/mm/book3s64/hash_native.c b/arch/powerpc/mm/book3s64/hash_native.c index 97fa42d7027e..52e170bd95ae 100644 --- a/arch/powerpc/mm/book3s64/hash_native.c +++ b/arch/powerpc/mm/book3s64/hash_native.c @@ -92,16 +92,15 @@ static void tlbiel_all_isa300(unsigned int num_sets, unsigned int is) asm volatile("ptesync": : :"memory"); /* - * Flush the first set of the TLB, and any caching of partition table - * entries. Then flush the remaining sets of the TLB. Hash mode uses - * partition scoped TLB translations. + * Flush the partition table cache if this is HV mode. */ - tlbiel_hash_set_isa300(0, is, 0, 2, 0); - for (set = 1; set < num_sets; set++) - tlbiel_hash_set_isa300(set, is, 0, 0, 0); + if (early_cpu_has_feature(CPU_FTR_HVMODE)) + tlbiel_hash_set_isa300(0, is, 0, 2, 0); /* - * Now invalidate the process table cache. + * Now invalidate the process table cache. UPRT=0 HPT modes (what + * current hardware implements) do not use the process table, but + * add the flushes anyway. * * From ISA v3.0B p. 1078: * The following forms are invalid. @@ -110,6 +109,14 @@ static void tlbiel_all_isa300(unsigned int num_sets, unsigned int is) */ tlbiel_hash_set_isa300(0, is, 0, 2, 1); + /* + * Then flush the sets of the TLB proper. Hash mode uses + * partition scoped TLB translations, which may be flushed + * in !HV mode. + */ + for (set = 0; set < num_sets; set++) + tlbiel_hash_set_isa300(set, is, 0, 0, 0); + ppc_after_tlbiel_barrier(); asm volatile(PPC_ISA_3_0_INVALIDATE_ERAT "; isync" : : :"memory"); -- cgit v1.2.3 From 01b0f0eae0812e80efeee4ee17687e5386335e08 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 26 Nov 2020 20:25:30 +1000 Subject: powerpc/64s: Trim offlined CPUs from mm_cpumasks When offlining a CPU, powerpc/64s does not flush TLBs, rather it just leaves the CPU set in mm_cpumasks, so it continues to receive TLBIEs to manage its TLBs. However the exit_flush_lazy_tlbs() function expects that after returning, all CPUs (except self) have flushed TLBs for that mm, in which case TLBIEL can be used for this flush. This breaks for offline CPUs because they don't get the IPI to flush their TLB. This can lead to stale translations. Fix this by clearing the CPU from mm_cpumasks, then flushing all TLBs before going offline. These offlined CPU bits stuck in the cpumask also prevents the cpumask from being trimmed back to local mode, which means continual broadcast IPIs or TLBIEs are needed for TLB flushing. This patch prevents that situation too. A cast of many were involved in working this out, but in particular Milton, Aneesh, Paul made key discoveries. Fixes: 0cef77c7798a7 ("powerpc/64s/radix: flush remote CPUs out of single-threaded mm_cpumask") Signed-off-by: Nicholas Piggin Reviewed-by: Aneesh Kumar K.V Debugged-by: Milton Miller Debugged-by: Aneesh Kumar K.V Debugged-by: Paul Mackerras Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20201126102530.691335-5-npiggin@gmail.com --- arch/powerpc/include/asm/book3s/64/mmu.h | 12 ++++++++++++ arch/powerpc/mm/book3s64/mmu_context.c | 20 ++++++++++++++++++++ arch/powerpc/platforms/powermac/smp.c | 2 ++ arch/powerpc/platforms/powernv/smp.c | 3 +++ arch/powerpc/platforms/pseries/hotplug-cpu.c | 3 +++ 5 files changed, 40 insertions(+) (limited to 'arch') diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h index e0b52940e43c..750918451dd2 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu.h +++ b/arch/powerpc/include/asm/book3s/64/mmu.h @@ -242,6 +242,18 @@ extern void radix_init_pseries(void); static inline void radix_init_pseries(void) { }; #endif +#ifdef CONFIG_HOTPLUG_CPU +#define arch_clear_mm_cpumask_cpu(cpu, mm) \ + do { \ + if (cpumask_test_cpu(cpu, mm_cpumask(mm))) { \ + atomic_dec(&(mm)->context.active_cpus); \ + cpumask_clear_cpu(cpu, mm_cpumask(mm)); \ + } \ + } while (0) + +void cleanup_cpu_mmu_context(void); +#endif + static inline int get_user_context(mm_context_t *ctx, unsigned long ea) { int index = ea >> MAX_EA_BITS_PER_CONTEXT; diff --git a/arch/powerpc/mm/book3s64/mmu_context.c b/arch/powerpc/mm/book3s64/mmu_context.c index 1c54821de7bf..0c8557220ae2 100644 --- a/arch/powerpc/mm/book3s64/mmu_context.c +++ b/arch/powerpc/mm/book3s64/mmu_context.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -307,3 +308,22 @@ void radix__switch_mmu_context(struct mm_struct *prev, struct mm_struct *next) isync(); } #endif + +/** + * cleanup_cpu_mmu_context - Clean up MMU details for this CPU (newly offlined) + * + * This clears the CPU from mm_cpumask for all processes, and then flushes the + * local TLB to ensure TLB coherency in case the CPU is onlined again. + * + * KVM guest translations are not necessarily flushed here. If KVM started + * using mm_cpumask or the Linux APIs which do, this would have to be resolved. + */ +#ifdef CONFIG_HOTPLUG_CPU +void cleanup_cpu_mmu_context(void) +{ + int cpu = smp_processor_id(); + + clear_tasks_mm_cpumask(cpu); + tlbiel_all(); +} +#endif diff --git a/arch/powerpc/platforms/powermac/smp.c b/arch/powerpc/platforms/powermac/smp.c index 74ebe664b016..adae2a6712e1 100644 --- a/arch/powerpc/platforms/powermac/smp.c +++ b/arch/powerpc/platforms/powermac/smp.c @@ -911,6 +911,8 @@ static int smp_core99_cpu_disable(void) mpic_cpu_set_priority(0xf); + cleanup_cpu_mmu_context(); + return 0; } diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c index 54c4ba45c7ce..cbb67813cd5d 100644 --- a/arch/powerpc/platforms/powernv/smp.c +++ b/arch/powerpc/platforms/powernv/smp.c @@ -143,6 +143,9 @@ static int pnv_smp_cpu_disable(void) xive_smp_disable_cpu(); else xics_migrate_irqs_away(); + + cleanup_cpu_mmu_context(); + return 0; } diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c index f2837e33bf5d..a02012f1b04a 100644 --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c @@ -90,6 +90,9 @@ static int pseries_cpu_disable(void) xive_smp_disable_cpu(); else xics_migrate_irqs_away(); + + cleanup_cpu_mmu_context(); + return 0; } -- cgit v1.2.3 From 10f78fd0dabbc3856ddd67b09a46abdedb045913 Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Fri, 27 Nov 2020 11:07:38 +0530 Subject: powerpc/numa: Fix a regression on memoryless node 0 Commit e75130f20b1f ("powerpc/numa: Offline memoryless cpuless node 0") offlines node 0 and expects nodes to be subsequently onlined when CPUs or nodes are detected. Commit 6398eaa26816 ("powerpc/numa: Prefer node id queried from vphn") skips onlining node 0 when CPUs are associated with node 0. On systems with node 0 having CPUs but no memory, this causes node 0 be marked offline. This causes issues at boot time when trying to set memory node for online CPUs while building the zonelist. 0:mon> t [link register ] c000000000400354 __build_all_zonelists+0x164/0x280 [c00000000161bda0] c0000000016533c8 node_states+0x20/0xa0 (unreliable) [c00000000161bdc0] c000000000400384 __build_all_zonelists+0x194/0x280 [c00000000161be30] c000000001041800 build_all_zonelists_init+0x4c/0x118 [c00000000161be80] c0000000004020d0 build_all_zonelists+0x190/0x1b0 [c00000000161bef0] c000000001003cf8 start_kernel+0x18c/0x6a8 [c00000000161bf90] c00000000000adb4 start_here_common+0x1c/0x3e8 0:mon> r R00 = c000000000400354 R16 = 000000000b57a0e8 R01 = c00000000161bda0 R17 = 000000000b57a6b0 R02 = c00000000161ce00 R18 = 000000000b5afee8 R03 = 0000000000000000 R19 = 000000000b6448a0 R04 = 0000000000000000 R20 = fffffffffffffffd R05 = 0000000000000000 R21 = 0000000001400000 R06 = 0000000000000000 R22 = 000000001ec00000 R07 = 0000000000000001 R23 = c000000001175580 R08 = 0000000000000000 R24 = c000000001651ed8 R09 = c0000000017e84d8 R25 = c000000001652480 R10 = 0000000000000000 R26 = c000000001175584 R11 = c000000c7fac0d10 R27 = c0000000019568d0 R12 = c000000000400180 R28 = 0000000000000000 R13 = c000000002200000 R29 = c00000000164dd78 R14 = 000000000b579f78 R30 = 0000000000000000 R15 = 000000000b57a2b8 R31 = c000000001175584 pc = c000000000400194 local_memory_node+0x24/0x80 cfar= c000000000074334 mcount+0xc/0x10 lr = c000000000400354 __build_all_zonelists+0x164/0x280 msr = 8000000002001033 cr = 44002284 ctr = c000000000400180 xer = 0000000000000001 trap = 380 dar = 0000000000001388 dsisr = c00000000161bc90 0:mon> Fix this by setting node to be online while onlining CPUs that belong to node 0. Fixes: e75130f20b1f ("powerpc/numa: Offline memoryless cpuless node 0") Fixes: 6398eaa26816 ("powerpc/numa: Prefer node id queried from vphn") Reported-by: Milan Mohanty Signed-off-by: Srikar Dronamraju Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20201127053738.10085-1-srikar@linux.vnet.ibm.com --- arch/powerpc/mm/numa.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 63f61d8b55e5..f2bf98bdcea2 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -742,8 +742,7 @@ static int __init parse_numa_properties(void) of_node_put(cpu); } - if (likely(nid > 0)) - node_set_online(nid); + node_set_online(nid); } get_n_mem_cells(&n_mem_addr_cells, &n_mem_size_cells); -- cgit v1.2.3 From f54db39fbe40731c40aefdd3bc26e7d56d668c64 Mon Sep 17 00:00:00 2001 From: Greg Kurz Date: Mon, 30 Nov 2020 13:19:27 +0100 Subject: KVM: PPC: Book3S HV: XIVE: Fix vCPU id sanity check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 062cfab7069f ("KVM: PPC: Book3S HV: XIVE: Make VP block size configurable") updated kvmppc_xive_vcpu_id_valid() in a way that allows userspace to trigger an assertion in skiboot and crash the host: [ 696.186248988,3] XIVE[ IC 08 ] eq_blk != vp_blk (0 vs. 1) for target 0x4300008c/0 [ 696.186314757,0] Assert fail: hw/xive.c:2370:0 [ 696.186342458,0] Aborting! xive-kvCPU 0043 Backtrace: S: 0000000031e2b8f0 R: 0000000030013840 .backtrace+0x48 S: 0000000031e2b990 R: 000000003001b2d0 ._abort+0x4c S: 0000000031e2ba10 R: 000000003001b34c .assert_fail+0x34 S: 0000000031e2ba90 R: 0000000030058984 .xive_eq_for_target.part.20+0xb0 S: 0000000031e2bb40 R: 0000000030059fdc .xive_setup_silent_gather+0x2c S: 0000000031e2bc20 R: 000000003005a334 .opal_xive_set_vp_info+0x124 S: 0000000031e2bd20 R: 00000000300051a4 opal_entry+0x134 --- OPAL call token: 0x8a caller R1: 0xc000001f28563850 --- XIVE maintains the interrupt context state of non-dispatched vCPUs in an internal VP structure. We allocate a bunch of those on startup to accommodate all possible vCPUs. Each VP has an id, that we derive from the vCPU id for efficiency: static inline u32 kvmppc_xive_vp(struct kvmppc_xive *xive, u32 server) { return xive->vp_base + kvmppc_pack_vcpu_id(xive->kvm, server); } The KVM XIVE device used to allocate KVM_MAX_VCPUS VPs. This was limitting the number of concurrent VMs because the VP space is limited on the HW. Since most of the time, VMs run with a lot less vCPUs, commit 062cfab7069f ("KVM: PPC: Book3S HV: XIVE: Make VP block size configurable") gave the possibility for userspace to tune the size of the VP block through the KVM_DEV_XIVE_NR_SERVERS attribute. The check in kvmppc_pack_vcpu_id() was changed from cpu < KVM_MAX_VCPUS * xive->kvm->arch.emul_smt_mode to cpu < xive->nr_servers * xive->kvm->arch.emul_smt_mode The previous check was based on the fact that the VP block had KVM_MAX_VCPUS entries and that kvmppc_pack_vcpu_id() guarantees that packed vCPU ids are below KVM_MAX_VCPUS. We've changed the size of the VP block, but kvmppc_pack_vcpu_id() has nothing to do with it and it certainly doesn't ensure that the packed vCPU ids are below xive->nr_servers. kvmppc_xive_vcpu_id_valid() might thus return true when the VM was configured with a non-standard VSMT mode, even if the packed vCPU id is higher than what we expect. We end up using an unallocated VP id, which confuses OPAL. The assert in OPAL is probably abusive and should be converted to a regular error that the kernel can handle, but we shouldn't really use broken VP ids in the first place. Fix kvmppc_xive_vcpu_id_valid() so that it checks the packed vCPU id is below xive->nr_servers, which is explicitly what we want. Fixes: 062cfab7069f ("KVM: PPC: Book3S HV: XIVE: Make VP block size configurable") Cc: stable@vger.kernel.org # v5.5+ Signed-off-by: Greg Kurz Reviewed-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/160673876747.695514.1809676603724514920.stgit@bahia.lan --- arch/powerpc/kvm/book3s_xive.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index 85215e79db42..a0ebc29f30b2 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -1214,12 +1214,9 @@ void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu) static bool kvmppc_xive_vcpu_id_valid(struct kvmppc_xive *xive, u32 cpu) { /* We have a block of xive->nr_servers VPs. We just need to check - * raw vCPU ids are below the expected limit for this guest's - * core stride ; kvmppc_pack_vcpu_id() will pack them down to an - * index that can be safely used to compute a VP id that belongs - * to the VP block. + * packed vCPU ids are below that. */ - return cpu < xive->nr_servers * xive->kvm->arch.emul_smt_mode; + return kvmppc_pack_vcpu_id(xive->kvm, cpu) < xive->nr_servers; } int kvmppc_xive_compute_vp_id(struct kvmppc_xive *xive, u32 cpu, u32 *vp) -- cgit v1.2.3 From a1ee28117077c3bf24e5ab6324c835eaab629c45 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Sat, 28 Nov 2020 17:07:21 +1000 Subject: powerpc/64s/powernv: Fix memory corruption when saving SLB entries on MCE This can be hit by an HPT guest running on an HPT host and bring down the host, so it's quite important to fix. Fixes: 7290f3b3d3e6 ("powerpc/64s/powernv: machine check dump SLB contents") Cc: stable@vger.kernel.org # v5.4+ Signed-off-by: Nicholas Piggin Acked-by: Mahesh Salgaonkar Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20201128070728.825934-2-npiggin@gmail.com --- arch/powerpc/platforms/powernv/setup.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c index 46115231a3b2..4426a109ec2f 100644 --- a/arch/powerpc/platforms/powernv/setup.c +++ b/arch/powerpc/platforms/powernv/setup.c @@ -211,11 +211,16 @@ static void __init pnv_init(void) add_preferred_console("hvc", 0, NULL); if (!radix_enabled()) { + size_t size = sizeof(struct slb_entry) * mmu_slb_size; int i; /* Allocate per cpu area to save old slb contents during MCE */ - for_each_possible_cpu(i) - paca_ptrs[i]->mce_faulty_slbs = memblock_alloc_node(mmu_slb_size, __alignof__(*paca_ptrs[i]->mce_faulty_slbs), cpu_to_node(i)); + for_each_possible_cpu(i) { + paca_ptrs[i]->mce_faulty_slbs = + memblock_alloc_node(size, + __alignof__(struct slb_entry), + cpu_to_node(i)); + } } } -- cgit v1.2.3