From dbf563eee0b8cc056744514d91c5ffc2fa6c0982 Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Mon, 26 Oct 2020 07:52:52 -0700 Subject: x86/hyperv: Clarify comment on x2apic mode The comment about Hyper-V accessors is unclear regarding their potential use in x2apic mode, as is the associated commit message in e211288b72f1. Clarify that while the architectural and synthetic MSRs are equivalent in x2apic mode, the full set of xapic accessors cannot be used because of register layout differences. Fixes: e211288b72f1 ("x86/hyperv: Make vapic support x2apic mode") Signed-off-by: Michael Kelley Link: https://lore.kernel.org/r/1603723972-81303-1-git-send-email-mikelley@microsoft.com Signed-off-by: Wei Liu --- arch/x86/hyperv/hv_apic.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c index 40e0e322161d..284e73661a18 100644 --- a/arch/x86/hyperv/hv_apic.c +++ b/arch/x86/hyperv/hv_apic.c @@ -273,11 +273,15 @@ void __init hv_apic_init(void) pr_info("Hyper-V: Using enlightened APIC (%s mode)", x2apic_enabled() ? "x2apic" : "xapic"); /* - * With x2apic, architectural x2apic MSRs are equivalent to the - * respective synthetic MSRs, so there's no need to override - * the apic accessors. The only exception is - * hv_apic_eoi_write, because it benefits from lazy EOI when - * available, but it works for both xapic and x2apic modes. + * When in x2apic mode, don't use the Hyper-V specific APIC + * accessors since the field layout in the ICR register is + * different in x2apic mode. Furthermore, the architectural + * x2apic MSRs function just as well as the Hyper-V + * synthetic APIC MSRs, so there's no benefit in having + * separate Hyper-V accessors for x2apic mode. The only + * exception is hv_apic_eoi_write, because it benefits from + * lazy EOI when available, but the same accessor works for + * both xapic and x2apic because the field layout is the same. */ apic_set_eoi_write(hv_apic_eoi_write); if (!x2apic_enabled()) { -- cgit v1.2.3 From 3ad84246a4097010f3ae3d6944120c0be00e9e7a Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 28 Oct 2020 17:46:55 +0100 Subject: x86/boot/compressed/64: Introduce sev_status Introduce sev_status and initialize it together with sme_me_mask to have an indicator which SEV features are enabled. Signed-off-by: Joerg Roedel Signed-off-by: Borislav Petkov Reviewed-by: Tom Lendacky Link: https://lkml.kernel.org/r/20201028164659.27002-2-joro@8bytes.org --- arch/x86/boot/compressed/mem_encrypt.S | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/boot/compressed/mem_encrypt.S b/arch/x86/boot/compressed/mem_encrypt.S index dd07e7b41b11..3092ae173f94 100644 --- a/arch/x86/boot/compressed/mem_encrypt.S +++ b/arch/x86/boot/compressed/mem_encrypt.S @@ -81,6 +81,19 @@ SYM_FUNC_START(set_sev_encryption_mask) bts %rax, sme_me_mask(%rip) /* Create the encryption mask */ + /* + * Read MSR_AMD64_SEV again and store it to sev_status. Can't do this in + * get_sev_encryption_bit() because this function is 32-bit code and + * shared between 64-bit and 32-bit boot path. + */ + movl $MSR_AMD64_SEV, %ecx /* Read the SEV MSR */ + rdmsr + + /* Store MSR value in sev_status */ + shlq $32, %rdx + orq %rdx, %rax + movq %rax, sev_status(%rip) + .Lno_sev_mask: movq %rbp, %rsp /* Restore original stack pointer */ @@ -96,5 +109,6 @@ SYM_FUNC_END(set_sev_encryption_mask) #ifdef CONFIG_AMD_MEM_ENCRYPT .balign 8 -SYM_DATA(sme_me_mask, .quad 0) +SYM_DATA(sme_me_mask, .quad 0) +SYM_DATA(sev_status, .quad 0) #endif -- cgit v1.2.3 From ed7b895f3efb5df184722f5a30f8164fcaffceb1 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 28 Oct 2020 17:46:56 +0100 Subject: x86/boot/compressed/64: Sanity-check CPUID results in the early #VC handler The early #VC handler which doesn't have a GHCB can only handle CPUID exit codes. It is needed by the early boot code to handle #VC exceptions raised in verify_cpu() and to get the position of the C-bit. But the CPUID information comes from the hypervisor which is untrusted and might return results which trick the guest into the no-SEV boot path with no C-bit set in the page-tables. All data written to memory would then be unencrypted and could leak sensitive data to the hypervisor. Add sanity checks to the early #VC handler to make sure the hypervisor can not pretend that SEV is disabled. [ bp: Massage a bit. ] Signed-off-by: Joerg Roedel Signed-off-by: Borislav Petkov Reviewed-by: Tom Lendacky Link: https://lkml.kernel.org/r/20201028164659.27002-3-joro@8bytes.org --- arch/x86/kernel/sev-es-shared.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'arch') diff --git a/arch/x86/kernel/sev-es-shared.c b/arch/x86/kernel/sev-es-shared.c index 5f83ccaab877..7d04b356d44d 100644 --- a/arch/x86/kernel/sev-es-shared.c +++ b/arch/x86/kernel/sev-es-shared.c @@ -178,6 +178,32 @@ void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) goto fail; regs->dx = val >> 32; + /* + * This is a VC handler and the #VC is only raised when SEV-ES is + * active, which means SEV must be active too. Do sanity checks on the + * CPUID results to make sure the hypervisor does not trick the kernel + * into the no-sev path. This could map sensitive data unencrypted and + * make it accessible to the hypervisor. + * + * In particular, check for: + * - Hypervisor CPUID bit + * - Availability of CPUID leaf 0x8000001f + * - SEV CPUID bit. + * + * The hypervisor might still report the wrong C-bit position, but this + * can't be checked here. + */ + + if ((fn == 1 && !(regs->cx & BIT(31)))) + /* Hypervisor bit */ + goto fail; + else if (fn == 0x80000000 && (regs->ax < 0x8000001f)) + /* SEV leaf check */ + goto fail; + else if ((fn == 0x8000001f && !(regs->ax & BIT(1)))) + /* SEV bit */ + goto fail; + /* Skip over the CPUID two-byte opcode */ regs->ip += 2; -- cgit v1.2.3 From 86ce43f7dde81562f58b24b426cef068bd9f7595 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 28 Oct 2020 17:46:57 +0100 Subject: x86/boot/compressed/64: Check SEV encryption in 64-bit boot-path Check whether the hypervisor reported the correct C-bit when running as an SEV guest. Using a wrong C-bit position could be used to leak sensitive data from the guest to the hypervisor. The check function is in a separate file: arch/x86/kernel/sev_verify_cbit.S so that it can be re-used in the running kernel image. [ bp: Massage. ] Signed-off-by: Joerg Roedel Signed-off-by: Borislav Petkov Reviewed-by: Tom Lendacky Link: https://lkml.kernel.org/r/20201028164659.27002-4-joro@8bytes.org --- arch/x86/boot/compressed/ident_map_64.c | 1 + arch/x86/boot/compressed/mem_encrypt.S | 4 ++ arch/x86/boot/compressed/misc.h | 2 + arch/x86/kernel/sev_verify_cbit.S | 89 +++++++++++++++++++++++++++++++++ 4 files changed, 96 insertions(+) create mode 100644 arch/x86/kernel/sev_verify_cbit.S (limited to 'arch') diff --git a/arch/x86/boot/compressed/ident_map_64.c b/arch/x86/boot/compressed/ident_map_64.c index a5e5db6ada3c..39b2eded7bc2 100644 --- a/arch/x86/boot/compressed/ident_map_64.c +++ b/arch/x86/boot/compressed/ident_map_64.c @@ -164,6 +164,7 @@ void initialize_identity_maps(void *rmode) add_identity_map(cmdline, cmdline + COMMAND_LINE_SIZE); /* Load the new page-table. */ + sev_verify_cbit(top_level_pgt); write_cr3(top_level_pgt); } diff --git a/arch/x86/boot/compressed/mem_encrypt.S b/arch/x86/boot/compressed/mem_encrypt.S index 3092ae173f94..aa561795efd1 100644 --- a/arch/x86/boot/compressed/mem_encrypt.S +++ b/arch/x86/boot/compressed/mem_encrypt.S @@ -68,6 +68,9 @@ SYM_FUNC_START(get_sev_encryption_bit) SYM_FUNC_END(get_sev_encryption_bit) .code64 + +#include "../../kernel/sev_verify_cbit.S" + SYM_FUNC_START(set_sev_encryption_mask) #ifdef CONFIG_AMD_MEM_ENCRYPT push %rbp @@ -111,4 +114,5 @@ SYM_FUNC_END(set_sev_encryption_mask) .balign 8 SYM_DATA(sme_me_mask, .quad 0) SYM_DATA(sev_status, .quad 0) +SYM_DATA(sev_check_data, .quad 0) #endif diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 6d31f1b4c4d1..d9a631c5973c 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -159,4 +159,6 @@ void boot_page_fault(void); void boot_stage1_vc(void); void boot_stage2_vc(void); +unsigned long sev_verify_cbit(unsigned long cr3); + #endif /* BOOT_COMPRESSED_MISC_H */ diff --git a/arch/x86/kernel/sev_verify_cbit.S b/arch/x86/kernel/sev_verify_cbit.S new file mode 100644 index 000000000000..ee04941a6546 --- /dev/null +++ b/arch/x86/kernel/sev_verify_cbit.S @@ -0,0 +1,89 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * sev_verify_cbit.S - Code for verification of the C-bit position reported + * by the Hypervisor when running with SEV enabled. + * + * Copyright (c) 2020 Joerg Roedel (jroedel@suse.de) + * + * sev_verify_cbit() is called before switching to a new long-mode page-table + * at boot. + * + * Verify that the C-bit position is correct by writing a random value to + * an encrypted memory location while on the current page-table. Then it + * switches to the new page-table to verify the memory content is still the + * same. After that it switches back to the current page-table and when the + * check succeeded it returns. If the check failed the code invalidates the + * stack pointer and goes into a hlt loop. The stack-pointer is invalidated to + * make sure no interrupt or exception can get the CPU out of the hlt loop. + * + * New page-table pointer is expected in %rdi (first parameter) + * + */ +SYM_FUNC_START(sev_verify_cbit) +#ifdef CONFIG_AMD_MEM_ENCRYPT + /* First check if a C-bit was detected */ + movq sme_me_mask(%rip), %rsi + testq %rsi, %rsi + jz 3f + + /* sme_me_mask != 0 could mean SME or SEV - Check also for SEV */ + movq sev_status(%rip), %rsi + testq %rsi, %rsi + jz 3f + + /* Save CR4 in %rsi */ + movq %cr4, %rsi + + /* Disable Global Pages */ + movq %rsi, %rdx + andq $(~X86_CR4_PGE), %rdx + movq %rdx, %cr4 + + /* + * Verified that running under SEV - now get a random value using + * RDRAND. This instruction is mandatory when running as an SEV guest. + * + * Don't bail out of the loop if RDRAND returns errors. It is better to + * prevent forward progress than to work with a non-random value here. + */ +1: rdrand %rdx + jnc 1b + + /* Store value to memory and keep it in %rdx */ + movq %rdx, sev_check_data(%rip) + + /* Backup current %cr3 value to restore it later */ + movq %cr3, %rcx + + /* Switch to new %cr3 - This might unmap the stack */ + movq %rdi, %cr3 + + /* + * Compare value in %rdx with memory location. If C-bit is incorrect + * this would read the encrypted data and make the check fail. + */ + cmpq %rdx, sev_check_data(%rip) + + /* Restore old %cr3 */ + movq %rcx, %cr3 + + /* Restore previous CR4 */ + movq %rsi, %cr4 + + /* Check CMPQ result */ + je 3f + + /* + * The check failed, prevent any forward progress to prevent ROP + * attacks, invalidate the stack and go into a hlt loop. + */ + xorq %rsp, %rsp + subq $0x1000, %rsp +2: hlt + jmp 2b +3: +#endif + /* Return page-table pointer */ + movq %rdi, %rax + ret +SYM_FUNC_END(sev_verify_cbit) -- cgit v1.2.3 From c9f09539e16e281f92a27760fdfae71e8af036f6 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 28 Oct 2020 17:46:58 +0100 Subject: x86/head/64: Check SEV encryption before switching to kernel page-table When SEV is enabled, the kernel requests the C-bit position again from the hypervisor to build its own page-table. Since the hypervisor is an untrusted source, the C-bit position needs to be verified before the kernel page-table is used. Call sev_verify_cbit() before writing the CR3. [ bp: Massage. ] Signed-off-by: Joerg Roedel Signed-off-by: Borislav Petkov Reviewed-by: Tom Lendacky Link: https://lkml.kernel.org/r/20201028164659.27002-5-joro@8bytes.org --- arch/x86/kernel/head_64.S | 16 ++++++++++++++++ arch/x86/mm/mem_encrypt.c | 1 + 2 files changed, 17 insertions(+) (limited to 'arch') diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 7eb2a1c87969..3c417734790f 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -161,6 +161,21 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) /* Setup early boot stage 4-/5-level pagetables. */ addq phys_base(%rip), %rax + + /* + * For SEV guests: Verify that the C-bit is correct. A malicious + * hypervisor could lie about the C-bit position to perform a ROP + * attack on the guest by writing to the unencrypted stack and wait for + * the next RET instruction. + * %rsi carries pointer to realmode data and is callee-clobbered. Save + * and restore it. + */ + pushq %rsi + movq %rax, %rdi + call sev_verify_cbit + popq %rsi + + /* Switch to new page-table */ movq %rax, %cr3 /* Ensure I am executing from virtual addresses */ @@ -279,6 +294,7 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) SYM_CODE_END(secondary_startup_64) #include "verify_cpu.S" +#include "sev_verify_cbit.S" #ifdef CONFIG_HOTPLUG_CPU /* diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index efbb3de472df..bc0833713be9 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -39,6 +39,7 @@ */ u64 sme_me_mask __section(".data") = 0; u64 sev_status __section(".data") = 0; +u64 sev_check_data __section(".data") = 0; EXPORT_SYMBOL(sme_me_mask); DEFINE_STATIC_KEY_FALSE(sev_enable_key); EXPORT_SYMBOL_GPL(sev_enable_key); -- cgit v1.2.3 From 2411cd82112397bfb9d8f0f19cd46c3d71e0ce67 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 28 Oct 2020 17:46:59 +0100 Subject: x86/sev-es: Do not support MMIO to/from encrypted memory MMIO memory is usually not mapped encrypted, so there is no reason to support emulated MMIO when it is mapped encrypted. Prevent a possible hypervisor attack where a RAM page is mapped as an MMIO page in the nested page-table, so that any guest access to it will trigger a #VC exception and leak the data on that page to the hypervisor via the GHCB (like with valid MMIO). On the read side this attack would allow the HV to inject data into the guest. Signed-off-by: Joerg Roedel Signed-off-by: Borislav Petkov Reviewed-by: Tom Lendacky Link: https://lkml.kernel.org/r/20201028164659.27002-6-joro@8bytes.org --- arch/x86/kernel/sev-es.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/sev-es.c b/arch/x86/kernel/sev-es.c index 4a96726fbaf8..0bd1a0fc587e 100644 --- a/arch/x86/kernel/sev-es.c +++ b/arch/x86/kernel/sev-es.c @@ -374,8 +374,8 @@ fault: return ES_EXCEPTION; } -static bool vc_slow_virt_to_phys(struct ghcb *ghcb, struct es_em_ctxt *ctxt, - unsigned long vaddr, phys_addr_t *paddr) +static enum es_result vc_slow_virt_to_phys(struct ghcb *ghcb, struct es_em_ctxt *ctxt, + unsigned long vaddr, phys_addr_t *paddr) { unsigned long va = (unsigned long)vaddr; unsigned int level; @@ -394,15 +394,19 @@ static bool vc_slow_virt_to_phys(struct ghcb *ghcb, struct es_em_ctxt *ctxt, if (user_mode(ctxt->regs)) ctxt->fi.error_code |= X86_PF_USER; - return false; + return ES_EXCEPTION; } + if (WARN_ON_ONCE(pte_val(*pte) & _PAGE_ENC)) + /* Emulated MMIO to/from encrypted memory not supported */ + return ES_UNSUPPORTED; + pa = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT; pa |= va & ~page_level_mask(level); *paddr = pa; - return true; + return ES_OK; } /* Include code shared with pre-decompression boot stage */ @@ -731,6 +735,7 @@ static enum es_result vc_do_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt, { u64 exit_code, exit_info_1, exit_info_2; unsigned long ghcb_pa = __pa(ghcb); + enum es_result res; phys_addr_t paddr; void __user *ref; @@ -740,11 +745,12 @@ static enum es_result vc_do_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt, exit_code = read ? SVM_VMGEXIT_MMIO_READ : SVM_VMGEXIT_MMIO_WRITE; - if (!vc_slow_virt_to_phys(ghcb, ctxt, (unsigned long)ref, &paddr)) { - if (!read) + res = vc_slow_virt_to_phys(ghcb, ctxt, (unsigned long)ref, &paddr); + if (res != ES_OK) { + if (res == ES_EXCEPTION && !read) ctxt->fi.error_code |= X86_PF_WRITE; - return ES_EXCEPTION; + return res; } exit_info_1 = paddr; -- cgit v1.2.3 From fd552e0542b4532483289cce48fdbd27b692984b Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Wed, 28 Oct 2020 11:27:17 -0400 Subject: powerpc/eeh_cache: Fix a possible debugfs deadlock Lockdep complains that a possible deadlock below in eeh_addr_cache_show() because it is acquiring a lock with IRQ enabled, but eeh_addr_cache_insert_dev() needs to acquire the same lock with IRQ disabled. Let's just make eeh_addr_cache_show() acquire the lock with IRQ disabled as well. CPU0 CPU1 ---- ---- lock(&pci_io_addr_cache_root.piar_lock); local_irq_disable(); lock(&tp->lock); lock(&pci_io_addr_cache_root.piar_lock); lock(&tp->lock); *** DEADLOCK *** lock_acquire+0x140/0x5f0 _raw_spin_lock_irqsave+0x64/0xb0 eeh_addr_cache_insert_dev+0x48/0x390 eeh_probe_device+0xb8/0x1a0 pnv_pcibios_bus_add_device+0x3c/0x80 pcibios_bus_add_device+0x118/0x290 pci_bus_add_device+0x28/0xe0 pci_bus_add_devices+0x54/0xb0 pcibios_init+0xc4/0x124 do_one_initcall+0xac/0x528 kernel_init_freeable+0x35c/0x3fc kernel_init+0x24/0x148 ret_from_kernel_thread+0x5c/0x80 lock_acquire+0x140/0x5f0 _raw_spin_lock+0x4c/0x70 eeh_addr_cache_show+0x38/0x110 seq_read+0x1a0/0x660 vfs_read+0xc8/0x1f0 ksys_read+0x74/0x130 system_call_exception+0xf8/0x1d0 system_call_common+0xe8/0x218 Fixes: 5ca85ae6318d ("powerpc/eeh_cache: Add a way to dump the EEH address cache") Signed-off-by: Qian Cai Reviewed-by: Oliver O'Halloran Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20201028152717.8967-1-cai@redhat.com --- arch/powerpc/kernel/eeh_cache.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/eeh_cache.c b/arch/powerpc/kernel/eeh_cache.c index 6b50bf15d8c1..bf3270426d82 100644 --- a/arch/powerpc/kernel/eeh_cache.c +++ b/arch/powerpc/kernel/eeh_cache.c @@ -264,8 +264,9 @@ static int eeh_addr_cache_show(struct seq_file *s, void *v) { struct pci_io_addr_range *piar; struct rb_node *n; + unsigned long flags; - spin_lock(&pci_io_addr_cache_root.piar_lock); + spin_lock_irqsave(&pci_io_addr_cache_root.piar_lock, flags); for (n = rb_first(&pci_io_addr_cache_root.rb_root); n; n = rb_next(n)) { piar = rb_entry(n, struct pci_io_addr_range, rb_node); @@ -273,7 +274,7 @@ static int eeh_addr_cache_show(struct seq_file *s, void *v) (piar->flags & IORESOURCE_IO) ? "i/o" : "mem", &piar->addr_lo, &piar->addr_hi, pci_name(piar->pcidev)); } - spin_unlock(&pci_io_addr_cache_root.piar_lock); + spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, flags); return 0; } -- cgit v1.2.3 From 99f070b62322a4b8c1252952735806d09eb44b68 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Wed, 28 Oct 2020 14:23:34 -0400 Subject: powerpc/smp: Call rcu_cpu_starting() earlier The call to rcu_cpu_starting() in start_secondary() is not early enough in the CPU-hotplug onlining process, which results in lockdep splats as follows (with CONFIG_PROVE_RCU_LIST=y): WARNING: suspicious RCU usage ----------------------------- kernel/locking/lockdep.c:3497 RCU-list traversed in non-reader section!! other info that might help us debug this: RCU used illegally from offline CPU! rcu_scheduler_active = 1, debug_locks = 1 no locks held by swapper/1/0. Call Trace: dump_stack+0xec/0x144 (unreliable) lockdep_rcu_suspicious+0x128/0x14c __lock_acquire+0x1060/0x1c60 lock_acquire+0x140/0x5f0 _raw_spin_lock_irqsave+0x64/0xb0 clockevents_register_device+0x74/0x270 register_decrementer_clockevent+0x94/0x110 start_secondary+0x134/0x800 start_secondary_prolog+0x10/0x14 This is avoided by adding a call to rcu_cpu_starting() near the beginning of the start_secondary() function. Note that the raw_smp_processor_id() is required in order to avoid calling into lockdep before RCU has declared the CPU to be watched for readers. It's safe to call rcu_cpu_starting() in the arch code as well as later in generic code, as explained by Paul: It uses a per-CPU variable so that RCU pays attention only to the first call to rcu_cpu_starting() if there is more than one of them. This is even intentional, due to there being a generic arch-independent call to rcu_cpu_starting() in notify_cpu_starting(). So multiple calls to rcu_cpu_starting() are fine by design. Fixes: 4d004099a668 ("lockdep: Fix lockdep recursion") Signed-off-by: Qian Cai Acked-by: Paul E. McKenney [mpe: Add Fixes tag, reword slightly & expand change log] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20201028182334.13466-1-cai@redhat.com --- arch/powerpc/kernel/smp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 3c6b9822f978..8c2857cbd960 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -1393,13 +1393,14 @@ static void add_cpu_to_masks(int cpu) /* Activate a secondary processor. */ void start_secondary(void *unused) { - unsigned int cpu = smp_processor_id(); + unsigned int cpu = raw_smp_processor_id(); mmgrab(&init_mm); current->active_mm = &init_mm; smp_store_cpu_info(cpu); set_dec(tb_ticks_per_jiffy); + rcu_cpu_starting(cpu); preempt_disable(); cpu_callin_map[cpu] = 1; -- cgit v1.2.3 From 328d2168ca524d501fc4b133d6be076142bd305c Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Tue, 27 Oct 2020 15:01:17 -0700 Subject: ARC: stack unwinding: avoid indefinite looping Currently stack unwinder is a while(1) loop which relies on the dwarf unwinder to signal termination, which in turn relies on dwarf info to do so. This in theory could cause an infinite loop if the dwarf info was somehow messed up or the register contents were etc. This fix thus detects the excessive looping and breaks the loop. | Mem: 26184K used, 1009136K free, 0K shrd, 0K buff, 14416K cached | CPU: 0.0% usr 72.8% sys 0.0% nic 27.1% idle 0.0% io 0.0% irq 0.0% sirq | Load average: 4.33 2.60 1.11 2/74 139 | PID PPID USER STAT VSZ %VSZ CPU %CPU COMMAND | 133 2 root SWN 0 0.0 3 22.9 [rcu_torture_rea] | 132 2 root SWN 0 0.0 0 22.0 [rcu_torture_rea] | 131 2 root SWN 0 0.0 3 21.5 [rcu_torture_rea] | 126 2 root RW 0 0.0 2 5.4 [rcu_torture_wri] | 129 2 root SWN 0 0.0 0 0.2 [rcu_torture_fak] | 137 2 root SW 0 0.0 0 0.2 [rcu_torture_cbf] | 127 2 root SWN 0 0.0 0 0.1 [rcu_torture_fak] | 138 115 root R 1464 0.1 2 0.1 top | 130 2 root SWN 0 0.0 0 0.1 [rcu_torture_fak] | 128 2 root SWN 0 0.0 0 0.1 [rcu_torture_fak] | 115 1 root S 1472 0.1 1 0.0 -/bin/sh | 104 1 root S 1464 0.1 0 0.0 inetd | 1 0 root S 1456 0.1 2 0.0 init | 78 1 root S 1456 0.1 0 0.0 syslogd -O /var/log/messages | 134 2 root SW 0 0.0 2 0.0 [rcu_torture_sta] | 10 2 root IW 0 0.0 1 0.0 [rcu_preempt] | 88 2 root IW 0 0.0 1 0.0 [kworker/1:1-eve] | 66 2 root IW 0 0.0 2 0.0 [kworker/2:2-eve] | 39 2 root IW 0 0.0 2 0.0 [kworker/2:1-eve] | unwinder looping too long, aborting ! Cc: Signed-off-by: Vineet Gupta --- arch/arc/kernel/stacktrace.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arc/kernel/stacktrace.c b/arch/arc/kernel/stacktrace.c index feba91c9d969..b23986f98450 100644 --- a/arch/arc/kernel/stacktrace.c +++ b/arch/arc/kernel/stacktrace.c @@ -112,7 +112,7 @@ arc_unwind_core(struct task_struct *tsk, struct pt_regs *regs, int (*consumer_fn) (unsigned int, void *), void *arg) { #ifdef CONFIG_ARC_DW2_UNWIND - int ret = 0; + int ret = 0, cnt = 0; unsigned int address; struct unwind_frame_info frame_info; @@ -132,6 +132,11 @@ arc_unwind_core(struct task_struct *tsk, struct pt_regs *regs, break; frame_info.regs.r63 = frame_info.regs.r31; + + if (cnt++ > 128) { + printk("unwinder looping too long, aborting !\n"); + return 0; + } } return address; /* return the last address it saw */ -- cgit v1.2.3 From 3b57533b460c8dc22a432684b7e8d22571f34d2e Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Thu, 29 Oct 2020 19:18:34 -0700 Subject: ARC: [plat-hsdk] Remap CCMs super early in asm boot trampoline ARC HSDK platform stopped booting on released v5.10-rc1, getting stuck in startup of non master SMP cores. This was bisected to upstream commit 7fef431be9c9ac25 "(mm/page_alloc: place pages to tail in __free_pages_core())" That commit itself is harmless, it just exposed a subtle assumption in our platform code (hence CC'ing linux-mm just as FYI in case some other arches / platforms trip on it). The upstream commit is semantically disruptive as it reverses the order of page allocations (actually it can be good test for hardware verification to exercise different memory patterns altogether). For ARC HSDK platform that meant a remapped memory region (pertaining to unused Closely Coupled Memory) started getting used early for dynamice allocations, while not effectively remapped on all the cores, triggering memory error exception on those cores. The fix is to move the CCM remapping from early platform code to to early core boot code. And while it is undesirable to riddle common boot code with platform quirks, there is no other way to do this since the faltering code involves setting up stack itself so even function calls are not allowed at that point. If anyone is interested, all the gory details can be found at Link below. Link: https://github.com/foss-for-synopsys-dwc-arc-processors/linux/issues/32 Cc: David Hildenbrand Cc: linux-mm@kvack.org Signed-off-by: Vineet Gupta --- arch/arc/kernel/head.S | 17 ++++++++++++++++- arch/arc/plat-hsdk/platform.c | 17 ----------------- 2 files changed, 16 insertions(+), 18 deletions(-) (limited to 'arch') diff --git a/arch/arc/kernel/head.S b/arch/arc/kernel/head.S index 17fd1ed700cc..9152782444b5 100644 --- a/arch/arc/kernel/head.S +++ b/arch/arc/kernel/head.S @@ -67,7 +67,22 @@ sr r5, [ARC_REG_LPB_CTRL] 1: #endif /* CONFIG_ARC_LPB_DISABLE */ -#endif + + /* On HSDK, CCMs need to remapped super early */ +#ifdef CONFIG_ARC_SOC_HSDK + mov r6, 0x60000000 + lr r5, [ARC_REG_ICCM_BUILD] + breq r5, 0, 1f + sr r6, [ARC_REG_AUX_ICCM] +1: + lr r5, [ARC_REG_DCCM_BUILD] + breq r5, 0, 2f + sr r6, [ARC_REG_AUX_DCCM] +2: +#endif /* CONFIG_ARC_SOC_HSDK */ + +#endif /* CONFIG_ISA_ARCV2 */ + ; Config DSP_CTRL properly, so kernel may use integer multiply, ; multiply-accumulate, and divide operations DSP_EARLY_INIT diff --git a/arch/arc/plat-hsdk/platform.c b/arch/arc/plat-hsdk/platform.c index 0b63fc095b99..b3ea1fa11f87 100644 --- a/arch/arc/plat-hsdk/platform.c +++ b/arch/arc/plat-hsdk/platform.c @@ -17,22 +17,6 @@ int arc_hsdk_axi_dmac_coherent __section(".data") = 0; #define ARC_CCM_UNUSED_ADDR 0x60000000 -static void __init hsdk_init_per_cpu(unsigned int cpu) -{ - /* - * By default ICCM is mapped to 0x7z while this area is used for - * kernel virtual mappings, so move it to currently unused area. - */ - if (cpuinfo_arc700[cpu].iccm.sz) - write_aux_reg(ARC_REG_AUX_ICCM, ARC_CCM_UNUSED_ADDR); - - /* - * By default DCCM is mapped to 0x8z while this area is used by kernel, - * so move it to currently unused area. - */ - if (cpuinfo_arc700[cpu].dccm.sz) - write_aux_reg(ARC_REG_AUX_DCCM, ARC_CCM_UNUSED_ADDR); -} #define ARC_PERIPHERAL_BASE 0xf0000000 #define CREG_BASE (ARC_PERIPHERAL_BASE + 0x1000) @@ -339,5 +323,4 @@ static const char *hsdk_compat[] __initconst = { MACHINE_START(SIMULATION, "hsdk") .dt_compat = hsdk_compat, .init_early = hsdk_init_early, - .init_per_cpu = hsdk_init_per_cpu, MACHINE_END -- cgit v1.2.3 From 2a13c13b39a8aea4c69a31549e4cb0094f30103b Mon Sep 17 00:00:00 2001 From: Vanshidhar Konda Date: Fri, 30 Oct 2020 10:30:50 -0700 Subject: arm64: NUMA: Kconfig: Increase NODES_SHIFT to 4 The current arm64 default config limits max NUMA nodes available on system to 4 (NODES_SHIFT = 2). Today's arm64 systems can reach or exceed 16 NUMA nodes. To accomodate current hardware and to fit NODES_SHIFT within page flags on arm64, increase NODES_SHIFT to 4. Signed-off-by: Vanshidhar Konda Acked-by: Catalin Marinas Link: https://lore.kernel.org/r/20201020173409.1266576-1-vanshikonda@os.amperecomputing.com/ Link: https://lore.kernel.org/r/20201030173050.1182876-1-vanshikonda@os.amperecomputing.com Signed-off-by: Will Deacon --- arch/arm64/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 1d466addb078..1515f6f153a0 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1002,7 +1002,7 @@ config NUMA config NODES_SHIFT int "Maximum NUMA Nodes (as a power of 2)" range 1 10 - default "2" + default "4" depends on NEED_MULTIPLE_NODES help Specify the maximum number of NUMA Nodes available on the target -- cgit v1.2.3 From 7ee31a3aa8f490c6507bc4294df6b70bed1c593e Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Tue, 3 Nov 2020 14:49:01 +0100 Subject: arm64: kprobes: Use BRK instead of single-step when executing instructions out-of-line Commit 36dadef23fcc ("kprobes: Init kprobes in early_initcall") enabled using kprobes from early_initcall. Unfortunately at this point the hardware debug infrastructure is not operational. The OS lock may still be locked, and the hardware watchpoints may have unknown values when kprobe enables debug monitors to single-step instructions. Rather than using hardware single-step, append a BRK instruction after the instruction to be executed out-of-line. Fixes: 36dadef23fcc ("kprobes: Init kprobes in early_initcall") Suggested-by: Will Deacon Signed-off-by: Jean-Philippe Brucker Acked-by: Masami Hiramatsu Link: https://lore.kernel.org/r/20201103134900.337243-1-jean-philippe@linaro.org Signed-off-by: Will Deacon --- arch/arm64/include/asm/brk-imm.h | 2 + arch/arm64/include/asm/debug-monitors.h | 1 + arch/arm64/include/asm/kprobes.h | 2 +- arch/arm64/kernel/probes/kprobes.c | 69 +++++++++++---------------------- 4 files changed, 27 insertions(+), 47 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/brk-imm.h b/arch/arm64/include/asm/brk-imm.h index e3d47b52161d..ec7720dbe2c8 100644 --- a/arch/arm64/include/asm/brk-imm.h +++ b/arch/arm64/include/asm/brk-imm.h @@ -10,6 +10,7 @@ * #imm16 values used for BRK instruction generation * 0x004: for installing kprobes * 0x005: for installing uprobes + * 0x006: for kprobe software single-step * Allowed values for kgdb are 0x400 - 0x7ff * 0x100: for triggering a fault on purpose (reserved) * 0x400: for dynamic BRK instruction @@ -19,6 +20,7 @@ */ #define KPROBES_BRK_IMM 0x004 #define UPROBES_BRK_IMM 0x005 +#define KPROBES_BRK_SS_IMM 0x006 #define FAULT_BRK_IMM 0x100 #define KGDB_DYN_DBG_BRK_IMM 0x400 #define KGDB_COMPILED_DBG_BRK_IMM 0x401 diff --git a/arch/arm64/include/asm/debug-monitors.h b/arch/arm64/include/asm/debug-monitors.h index 0b298f48f5bf..657c921fd784 100644 --- a/arch/arm64/include/asm/debug-monitors.h +++ b/arch/arm64/include/asm/debug-monitors.h @@ -53,6 +53,7 @@ /* kprobes BRK opcodes with ESR encoding */ #define BRK64_OPCODE_KPROBES (AARCH64_BREAK_MON | (KPROBES_BRK_IMM << 5)) +#define BRK64_OPCODE_KPROBES_SS (AARCH64_BREAK_MON | (KPROBES_BRK_SS_IMM << 5)) /* uprobes BRK opcodes with ESR encoding */ #define BRK64_OPCODE_UPROBES (AARCH64_BREAK_MON | (UPROBES_BRK_IMM << 5)) diff --git a/arch/arm64/include/asm/kprobes.h b/arch/arm64/include/asm/kprobes.h index 97e511d645a2..8699ce30f587 100644 --- a/arch/arm64/include/asm/kprobes.h +++ b/arch/arm64/include/asm/kprobes.h @@ -16,7 +16,7 @@ #include #define __ARCH_WANT_KPROBES_INSN_SLOT -#define MAX_INSN_SIZE 1 +#define MAX_INSN_SIZE 2 #define flush_insn_slot(p) do { } while (0) #define kretprobe_blacklist_size 0 diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c index deba738142ed..f11a1a1f7026 100644 --- a/arch/arm64/kernel/probes/kprobes.c +++ b/arch/arm64/kernel/probes/kprobes.c @@ -36,25 +36,16 @@ DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); static void __kprobes post_kprobe_handler(struct kprobe_ctlblk *, struct pt_regs *); -static int __kprobes patch_text(kprobe_opcode_t *addr, u32 opcode) -{ - void *addrs[1]; - u32 insns[1]; - - addrs[0] = addr; - insns[0] = opcode; - - return aarch64_insn_patch_text(addrs, insns, 1); -} - static void __kprobes arch_prepare_ss_slot(struct kprobe *p) { + kprobe_opcode_t *addr = p->ainsn.api.insn; + void *addrs[] = {addr, addr + 1}; + u32 insns[] = {p->opcode, BRK64_OPCODE_KPROBES_SS}; + /* prepare insn slot */ - patch_text(p->ainsn.api.insn, p->opcode); + aarch64_insn_patch_text(addrs, insns, 2); - flush_icache_range((uintptr_t) (p->ainsn.api.insn), - (uintptr_t) (p->ainsn.api.insn) + - MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); + flush_icache_range((uintptr_t)addr, (uintptr_t)(addr + MAX_INSN_SIZE)); /* * Needs restoring of return address after stepping xol. @@ -128,13 +119,18 @@ void *alloc_insn_page(void) /* arm kprobe: install breakpoint in text */ void __kprobes arch_arm_kprobe(struct kprobe *p) { - patch_text(p->addr, BRK64_OPCODE_KPROBES); + void *addr = p->addr; + u32 insn = BRK64_OPCODE_KPROBES; + + aarch64_insn_patch_text(&addr, &insn, 1); } /* disarm kprobe: remove breakpoint from text */ void __kprobes arch_disarm_kprobe(struct kprobe *p) { - patch_text(p->addr, p->opcode); + void *addr = p->addr; + + aarch64_insn_patch_text(&addr, &p->opcode, 1); } void __kprobes arch_remove_kprobe(struct kprobe *p) @@ -163,20 +159,15 @@ static void __kprobes set_current_kprobe(struct kprobe *p) } /* - * Interrupts need to be disabled before single-step mode is set, and not - * reenabled until after single-step mode ends. - * Without disabling interrupt on local CPU, there is a chance of - * interrupt occurrence in the period of exception return and start of - * out-of-line single-step, that result in wrongly single stepping - * into the interrupt handler. + * Mask all of DAIF while executing the instruction out-of-line, to keep things + * simple and avoid nesting exceptions. Interrupts do have to be disabled since + * the kprobe state is per-CPU and doesn't get migrated. */ static void __kprobes kprobes_save_local_irqflag(struct kprobe_ctlblk *kcb, struct pt_regs *regs) { kcb->saved_irqflag = regs->pstate & DAIF_MASK; - regs->pstate |= PSR_I_BIT; - /* Unmask PSTATE.D for enabling software step exceptions. */ - regs->pstate &= ~PSR_D_BIT; + regs->pstate |= DAIF_MASK; } static void __kprobes kprobes_restore_local_irqflag(struct kprobe_ctlblk *kcb, @@ -219,10 +210,7 @@ static void __kprobes setup_singlestep(struct kprobe *p, slot = (unsigned long)p->ainsn.api.insn; set_ss_context(kcb, slot); /* mark pending ss */ - - /* IRQs and single stepping do not mix well. */ kprobes_save_local_irqflag(kcb, regs); - kernel_enable_single_step(regs); instruction_pointer_set(regs, slot); } else { /* insn simulation */ @@ -273,12 +261,8 @@ post_kprobe_handler(struct kprobe_ctlblk *kcb, struct pt_regs *regs) } /* call post handler */ kcb->kprobe_status = KPROBE_HIT_SSDONE; - if (cur->post_handler) { - /* post_handler can hit breakpoint and single step - * again, so we enable D-flag for recursive exception. - */ + if (cur->post_handler) cur->post_handler(cur, regs, 0); - } reset_current_kprobe(); } @@ -302,8 +286,6 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, unsigned int fsr) if (!instruction_pointer(regs)) BUG(); - kernel_disable_single_step(); - if (kcb->kprobe_status == KPROBE_REENTER) restore_previous_kprobe(kcb); else @@ -365,10 +347,6 @@ static void __kprobes kprobe_handler(struct pt_regs *regs) * pre-handler and it returned non-zero, it will * modify the execution path and no need to single * stepping. Let's just reset current kprobe and exit. - * - * pre_handler can hit a breakpoint and can step thru - * before return, keep PSTATE D-flag enabled until - * pre_handler return back. */ if (!p->pre_handler || !p->pre_handler(p, regs)) { setup_singlestep(p, regs, kcb, 0); @@ -399,7 +377,7 @@ kprobe_ss_hit(struct kprobe_ctlblk *kcb, unsigned long addr) } static int __kprobes -kprobe_single_step_handler(struct pt_regs *regs, unsigned int esr) +kprobe_breakpoint_ss_handler(struct pt_regs *regs, unsigned int esr) { struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); int retval; @@ -409,16 +387,15 @@ kprobe_single_step_handler(struct pt_regs *regs, unsigned int esr) if (retval == DBG_HOOK_HANDLED) { kprobes_restore_local_irqflag(kcb, regs); - kernel_disable_single_step(); - post_kprobe_handler(kcb, regs); } return retval; } -static struct step_hook kprobes_step_hook = { - .fn = kprobe_single_step_handler, +static struct break_hook kprobes_break_ss_hook = { + .imm = KPROBES_BRK_SS_IMM, + .fn = kprobe_breakpoint_ss_handler, }; static int __kprobes @@ -486,7 +463,7 @@ int __kprobes arch_trampoline_kprobe(struct kprobe *p) int __init arch_init_kprobes(void) { register_kernel_break_hook(&kprobes_break_hook); - register_kernel_step_hook(&kprobes_step_hook); + register_kernel_break_hook(&kprobes_break_ss_hook); return 0; } -- cgit v1.2.3 From b0e98aa9c411585eb586b2fa98873c936735008e Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Tue, 20 Oct 2020 20:20:07 +0200 Subject: s390/mm: make pmd/pud_deref() large page aware pmd/pud_deref() assume that they will never operate on large pmd/pud entries, and therefore only use the non-large _xxx_ENTRY_ORIGIN mask. With commit 9ec8fa8dc331b ("s390/vmemmap: extend modify_pagetable() to handle vmemmap"), that assumption is no longer true, at least for pmd_deref(). In theory, we could end up with wrong addresses because some of the non-address bits of a large entry would not be masked out. In practice, this does not (yet) show any impact, because vmemmap_free() is currently never used for s390. Fix pmd/pud_deref() to check for the entry type and use the _xxx_ENTRY_ORIGIN_LARGE mask for large entries. While at it, also move pmd/pud_pfn() around, in order to avoid code duplication, because they do the same thing. Fixes: 9ec8fa8dc331b ("s390/vmemmap: extend modify_pagetable() to handle vmemmap") Cc: # 5.9 Signed-off-by: Gerald Schaefer Reviewed-by: Alexander Gordeev Signed-off-by: Heiko Carstens --- arch/s390/include/asm/pgtable.h | 52 ++++++++++++++++++++++++----------------- 1 file changed, 30 insertions(+), 22 deletions(-) (limited to 'arch') diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 6b8d8c69b1a1..b5dbae78969b 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -692,16 +692,6 @@ static inline int pud_large(pud_t pud) return !!(pud_val(pud) & _REGION3_ENTRY_LARGE); } -static inline unsigned long pud_pfn(pud_t pud) -{ - unsigned long origin_mask; - - origin_mask = _REGION_ENTRY_ORIGIN; - if (pud_large(pud)) - origin_mask = _REGION3_ENTRY_ORIGIN_LARGE; - return (pud_val(pud) & origin_mask) >> PAGE_SHIFT; -} - #define pmd_leaf pmd_large static inline int pmd_large(pmd_t pmd) { @@ -747,16 +737,6 @@ static inline int pmd_none(pmd_t pmd) return pmd_val(pmd) == _SEGMENT_ENTRY_EMPTY; } -static inline unsigned long pmd_pfn(pmd_t pmd) -{ - unsigned long origin_mask; - - origin_mask = _SEGMENT_ENTRY_ORIGIN; - if (pmd_large(pmd)) - origin_mask = _SEGMENT_ENTRY_ORIGIN_LARGE; - return (pmd_val(pmd) & origin_mask) >> PAGE_SHIFT; -} - #define pmd_write pmd_write static inline int pmd_write(pmd_t pmd) { @@ -1238,11 +1218,39 @@ static inline pte_t mk_pte(struct page *page, pgprot_t pgprot) #define pud_index(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD-1)) #define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1)) -#define pmd_deref(pmd) (pmd_val(pmd) & _SEGMENT_ENTRY_ORIGIN) -#define pud_deref(pud) (pud_val(pud) & _REGION_ENTRY_ORIGIN) #define p4d_deref(pud) (p4d_val(pud) & _REGION_ENTRY_ORIGIN) #define pgd_deref(pgd) (pgd_val(pgd) & _REGION_ENTRY_ORIGIN) +static inline unsigned long pmd_deref(pmd_t pmd) +{ + unsigned long origin_mask; + + origin_mask = _SEGMENT_ENTRY_ORIGIN; + if (pmd_large(pmd)) + origin_mask = _SEGMENT_ENTRY_ORIGIN_LARGE; + return pmd_val(pmd) & origin_mask; +} + +static inline unsigned long pmd_pfn(pmd_t pmd) +{ + return pmd_deref(pmd) >> PAGE_SHIFT; +} + +static inline unsigned long pud_deref(pud_t pud) +{ + unsigned long origin_mask; + + origin_mask = _REGION_ENTRY_ORIGIN; + if (pud_large(pud)) + origin_mask = _REGION3_ENTRY_ORIGIN_LARGE; + return pud_val(pud) & origin_mask; +} + +static inline unsigned long pud_pfn(pud_t pud) +{ + return pud_deref(pud) >> PAGE_SHIFT; +} + /* * The pgd_offset function *always* adds the index for the top-level * region/segment table. This is done to get a sequence like the -- cgit v1.2.3 From e99198661ecd02545b926ba40d1e91626bb29647 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 26 Oct 2020 16:15:22 +0100 Subject: s390/vdso: remove empty unused file Signed-off-by: Heiko Carstens --- arch/s390/include/asm/vdso/vdso.h | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 arch/s390/include/asm/vdso/vdso.h (limited to 'arch') diff --git a/arch/s390/include/asm/vdso/vdso.h b/arch/s390/include/asm/vdso/vdso.h deleted file mode 100644 index e69de29bb2d1..000000000000 -- cgit v1.2.3 From cfef9aa69a7382a205661a83e621114b37824474 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 26 Oct 2020 16:15:22 +0100 Subject: s390/vdso: remove unused constants Signed-off-by: Heiko Carstens --- arch/s390/kernel/asm-offsets.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'arch') diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c index ece58f2217cb..2012c1cf0853 100644 --- a/arch/s390/kernel/asm-offsets.c +++ b/arch/s390/kernel/asm-offsets.c @@ -61,14 +61,6 @@ int main(void) BLANK(); OFFSET(__VDSO_GETCPU_VAL, vdso_per_cpu_data, getcpu_val); BLANK(); - /* constants used by the vdso */ - DEFINE(__CLOCK_REALTIME, CLOCK_REALTIME); - DEFINE(__CLOCK_MONOTONIC, CLOCK_MONOTONIC); - DEFINE(__CLOCK_REALTIME_COARSE, CLOCK_REALTIME_COARSE); - DEFINE(__CLOCK_MONOTONIC_COARSE, CLOCK_MONOTONIC_COARSE); - DEFINE(__CLOCK_THREAD_CPUTIME_ID, CLOCK_THREAD_CPUTIME_ID); - DEFINE(__CLOCK_COARSE_RES, LOW_RES_NSEC); - BLANK(); /* idle data offsets */ OFFSET(__CLOCK_IDLE_ENTER, s390_idle_data, clock_idle_enter); OFFSET(__CLOCK_IDLE_EXIT, s390_idle_data, clock_idle_exit); -- cgit v1.2.3 From c3d9cdca73d0e49f01a71cdc477a09b04b1b30fc Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 26 Oct 2020 19:52:04 +0100 Subject: s390: update defconfigs Signed-off-by: Heiko Carstens --- arch/s390/configs/debug_defconfig | 10 ++++++---- arch/s390/configs/defconfig | 9 +++++---- arch/s390/configs/zfcpdump_defconfig | 2 +- 3 files changed, 12 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index 0784bf3caf43..a4d3c578fbd8 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -93,9 +93,10 @@ CONFIG_CLEANCACHE=y CONFIG_FRONTSWAP=y CONFIG_CMA_DEBUG=y CONFIG_CMA_DEBUGFS=y +CONFIG_CMA_AREAS=7 CONFIG_MEM_SOFT_DIRTY=y CONFIG_ZSWAP=y -CONFIG_ZSMALLOC=m +CONFIG_ZSMALLOC=y CONFIG_ZSMALLOC_STAT=y CONFIG_DEFERRED_STRUCT_PAGE_INIT=y CONFIG_IDLE_PAGE_TRACKING=y @@ -378,7 +379,6 @@ CONFIG_NETLINK_DIAG=m CONFIG_CGROUP_NET_PRIO=y CONFIG_BPF_JIT=y CONFIG_NET_PKTGEN=m -# CONFIG_NET_DROP_MONITOR is not set CONFIG_PCI=y # CONFIG_PCIEASPM is not set CONFIG_PCI_DEBUG=y @@ -386,7 +386,7 @@ CONFIG_HOTPLUG_PCI=y CONFIG_HOTPLUG_PCI_S390=y CONFIG_DEVTMPFS=y CONFIG_CONNECTOR=y -CONFIG_ZRAM=m +CONFIG_ZRAM=y CONFIG_BLK_DEV_LOOP=m CONFIG_BLK_DEV_CRYPTOLOOP=m CONFIG_BLK_DEV_DRBD=m @@ -689,6 +689,7 @@ CONFIG_CRYPTO_TEST=m CONFIG_CRYPTO_DH=m CONFIG_CRYPTO_ECDH=m CONFIG_CRYPTO_ECRDSA=m +CONFIG_CRYPTO_SM2=m CONFIG_CRYPTO_CURVE25519=m CONFIG_CRYPTO_GCM=y CONFIG_CRYPTO_CHACHA20POLY1305=m @@ -709,7 +710,6 @@ CONFIG_CRYPTO_RMD160=m CONFIG_CRYPTO_RMD256=m CONFIG_CRYPTO_RMD320=m CONFIG_CRYPTO_SHA3=m -CONFIG_CRYPTO_SM3=m CONFIG_CRYPTO_TGR192=m CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_AES_TI=m @@ -753,6 +753,7 @@ CONFIG_CRYPTO_DES_S390=m CONFIG_CRYPTO_AES_S390=m CONFIG_CRYPTO_GHASH_S390=m CONFIG_CRYPTO_CRC32_S390=y +CONFIG_CRYPTO_DEV_VIRTIO=m CONFIG_CORDIC=m CONFIG_CRC32_SELFTEST=y CONFIG_CRC4=m @@ -829,6 +830,7 @@ CONFIG_NETDEV_NOTIFIER_ERROR_INJECT=m CONFIG_FAULT_INJECTION=y CONFIG_FAILSLAB=y CONFIG_FAIL_PAGE_ALLOC=y +CONFIG_FAULT_INJECTION_USERCOPY=y CONFIG_FAIL_MAKE_REQUEST=y CONFIG_FAIL_IO_TIMEOUT=y CONFIG_FAIL_FUTEX=y diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index 905bc8c4cfaf..17d5df2c1eff 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -87,9 +87,10 @@ CONFIG_KSM=y CONFIG_TRANSPARENT_HUGEPAGE=y CONFIG_CLEANCACHE=y CONFIG_FRONTSWAP=y +CONFIG_CMA_AREAS=7 CONFIG_MEM_SOFT_DIRTY=y CONFIG_ZSWAP=y -CONFIG_ZSMALLOC=m +CONFIG_ZSMALLOC=y CONFIG_ZSMALLOC_STAT=y CONFIG_DEFERRED_STRUCT_PAGE_INIT=y CONFIG_IDLE_PAGE_TRACKING=y @@ -371,7 +372,6 @@ CONFIG_NETLINK_DIAG=m CONFIG_CGROUP_NET_PRIO=y CONFIG_BPF_JIT=y CONFIG_NET_PKTGEN=m -# CONFIG_NET_DROP_MONITOR is not set CONFIG_PCI=y # CONFIG_PCIEASPM is not set CONFIG_HOTPLUG_PCI=y @@ -379,7 +379,7 @@ CONFIG_HOTPLUG_PCI_S390=y CONFIG_UEVENT_HELPER=y CONFIG_DEVTMPFS=y CONFIG_CONNECTOR=y -CONFIG_ZRAM=m +CONFIG_ZRAM=y CONFIG_BLK_DEV_LOOP=m CONFIG_BLK_DEV_CRYPTOLOOP=m CONFIG_BLK_DEV_DRBD=m @@ -680,6 +680,7 @@ CONFIG_CRYPTO_TEST=m CONFIG_CRYPTO_DH=m CONFIG_CRYPTO_ECDH=m CONFIG_CRYPTO_ECRDSA=m +CONFIG_CRYPTO_SM2=m CONFIG_CRYPTO_CURVE25519=m CONFIG_CRYPTO_GCM=y CONFIG_CRYPTO_CHACHA20POLY1305=m @@ -701,7 +702,6 @@ CONFIG_CRYPTO_RMD160=m CONFIG_CRYPTO_RMD256=m CONFIG_CRYPTO_RMD320=m CONFIG_CRYPTO_SHA3=m -CONFIG_CRYPTO_SM3=m CONFIG_CRYPTO_TGR192=m CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_AES_TI=m @@ -745,6 +745,7 @@ CONFIG_CRYPTO_DES_S390=m CONFIG_CRYPTO_AES_S390=m CONFIG_CRYPTO_GHASH_S390=m CONFIG_CRYPTO_CRC32_S390=y +CONFIG_CRYPTO_DEV_VIRTIO=m CONFIG_CORDIC=m CONFIG_PRIME_NUMBERS=m CONFIG_CRC4=m diff --git a/arch/s390/configs/zfcpdump_defconfig b/arch/s390/configs/zfcpdump_defconfig index 8f67c55625f9..a302630341ef 100644 --- a/arch/s390/configs/zfcpdump_defconfig +++ b/arch/s390/configs/zfcpdump_defconfig @@ -17,11 +17,11 @@ CONFIG_HZ_100=y # CONFIG_CHSC_SCH is not set # CONFIG_SCM_BUS is not set CONFIG_CRASH_DUMP=y -# CONFIG_SECCOMP is not set # CONFIG_PFAULT is not set # CONFIG_S390_HYPFS_FS is not set # CONFIG_VIRTUALIZATION is not set # CONFIG_S390_GUEST is not set +# CONFIG_SECCOMP is not set CONFIG_PARTITION_ADVANCED=y CONFIG_IBM_PARTITION=y # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set -- cgit v1.2.3 From de5d9dae150ca1c1b5c7676711a9ca139d1a8dec Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Wed, 28 Oct 2020 14:27:42 -0400 Subject: s390/smp: move rcu_cpu_starting() earlier The call to rcu_cpu_starting() in smp_init_secondary() is not early enough in the CPU-hotplug onlining process, which results in lockdep splats as follows: WARNING: suspicious RCU usage ----------------------------- kernel/locking/lockdep.c:3497 RCU-list traversed in non-reader section!! other info that might help us debug this: RCU used illegally from offline CPU! rcu_scheduler_active = 1, debug_locks = 1 no locks held by swapper/1/0. Call Trace: show_stack+0x158/0x1f0 dump_stack+0x1f2/0x238 __lock_acquire+0x2640/0x4dd0 lock_acquire+0x3a8/0xd08 _raw_spin_lock_irqsave+0xc0/0xf0 clockevents_register_device+0xa8/0x528 init_cpu_timer+0x33e/0x468 smp_init_secondary+0x11a/0x328 smp_start_secondary+0x82/0x88 This is avoided by moving the call to rcu_cpu_starting up near the beginning of the smp_init_secondary() function. Note that the raw_smp_processor_id() is required in order to avoid calling into lockdep before RCU has declared the CPU to be watched for readers. Link: https://lore.kernel.org/lkml/160223032121.7002.1269740091547117869.tip-bot2@tip-bot2/ Signed-off-by: Qian Cai Acked-by: Paul E. McKenney Signed-off-by: Heiko Carstens --- arch/s390/kernel/smp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index ebfe86d097f0..390d97daa2b3 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -855,13 +855,14 @@ void __init smp_detect_cpus(void) static void smp_init_secondary(void) { - int cpu = smp_processor_id(); + int cpu = raw_smp_processor_id(); S390_lowcore.last_update_clock = get_tod_clock(); restore_access_regs(S390_lowcore.access_regs_save_area); set_cpu_flag(CIF_ASCE_PRIMARY); set_cpu_flag(CIF_ASCE_SECONDARY); cpu_init(); + rcu_cpu_starting(cpu); preempt_disable(); init_cpu_timer(); vtime_init(); -- cgit v1.2.3 From 0b2ca2c7d0c9e2731d01b6c862375d44a7e13923 Mon Sep 17 00:00:00 2001 From: Niklas Schnelle Date: Mon, 2 Nov 2020 11:33:04 +0100 Subject: s390/pci: fix hot-plug of PCI function missing bus Under some circumstances in particular with "Reconfigure I/O Path" a zPCI function may first appear in Standby through a PCI event with PEC 0x0302 which initially makes it visible to the zPCI subsystem, Only after that is it configured with a zPCI event with PEC 0x0301. If the zbus is still missing a PCI function zero (devfn == 0) when the PCI event 0x0301 is handled zdev->zbus->bus is still NULL and gets dereferenced in common code. Check for this case and enable but don't scan the zPCI function. This matches what would happen if we immediately got the 0x0301 configuration request or the function was included in CLP List PCI. In all cases the PCI functions with devfn != 0 will be scanned once function 0 appears. Fixes: 3047766bc6ec ("s390/pci: fix enabling a reserved PCI function") Cc: # 5.8 Signed-off-by: Niklas Schnelle Acked-by: Pierre Morel Signed-off-by: Heiko Carstens --- arch/s390/pci/pci_event.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch') diff --git a/arch/s390/pci/pci_event.c b/arch/s390/pci/pci_event.c index d33f21545dfd..9a6bae503fe6 100644 --- a/arch/s390/pci/pci_event.c +++ b/arch/s390/pci/pci_event.c @@ -101,6 +101,10 @@ static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf) if (ret) break; + /* the PCI function will be scanned once function 0 appears */ + if (!zdev->zbus->bus) + break; + pdev = pci_scan_single_device(zdev->zbus->bus, zdev->devfn); if (!pdev) break; -- cgit v1.2.3 From b9bc36704cca500e2b41be4c5bf615c1d7ddc3ce Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sat, 31 Oct 2020 11:43:45 +0200 Subject: ARM, xtensa: highmem: avoid clobbering non-page aligned memory reservations free_highpages() iterates over the free memblock regions in high memory, and marks each page as available for the memory management system. Until commit cddb5ddf2b76 ("arm, xtensa: simplify initialization of high memory pages") it rounded beginning of each region upwards and end of each region downwards. However, after that commit free_highmem() rounds the beginning and end of each region downwards, and we may end up freeing a page that is memblock_reserve()d, resulting in memory corruption. Restore the original rounding of the region boundaries to avoid freeing reserved pages. Fixes: cddb5ddf2b76 ("arm, xtensa: simplify initialization of high memory pages") Link: https://lore.kernel.org/r/20201029110334.4118-1-ardb@kernel.org/ Link: https://lore.kernel.org/r/20201031094345.6984-1-rppt@kernel.org Signed-off-by: Ard Biesheuvel Co-developed-by: Mike Rapoport Signed-off-by: Mike Rapoport Acked-by: Max Filippov --- arch/arm/mm/init.c | 4 ++-- arch/xtensa/mm/init.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index d57112a276f5..c23dbf8bebee 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -354,8 +354,8 @@ static void __init free_highpages(void) /* set highmem page free */ for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &range_start, &range_end, NULL) { - unsigned long start = PHYS_PFN(range_start); - unsigned long end = PHYS_PFN(range_end); + unsigned long start = PFN_UP(range_start); + unsigned long end = PFN_DOWN(range_end); /* Ignore complete lowmem entries */ if (end <= max_low) diff --git a/arch/xtensa/mm/init.c b/arch/xtensa/mm/init.c index c6fc83efee0c..8731b7ad9308 100644 --- a/arch/xtensa/mm/init.c +++ b/arch/xtensa/mm/init.c @@ -89,8 +89,8 @@ static void __init free_highpages(void) /* set highmem page free */ for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &range_start, &range_end, NULL) { - unsigned long start = PHYS_PFN(range_start); - unsigned long end = PHYS_PFN(range_end); + unsigned long start = PFN_UP(range_start); + unsigned long end = PFN_DOWN(range_end); /* Ignore complete lowmem entries */ if (end <= max_low) -- cgit v1.2.3 From 4d6ffa27b8e5116c0abb318790fd01d4e12d75e6 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Mon, 2 Nov 2020 17:23:58 -0800 Subject: x86/lib: Change .weak to SYM_FUNC_START_WEAK for arch/x86/lib/mem*_64.S Commit 393f203f5fd5 ("x86_64: kasan: add interceptors for memset/memmove/memcpy functions") added .weak directives to arch/x86/lib/mem*_64.S instead of changing the existing ENTRY macros to WEAK. This can lead to the assembly snippet .weak memcpy ... .globl memcpy which will produce a STB_WEAK memcpy with GNU as but STB_GLOBAL memcpy with LLVM's integrated assembler before LLVM 12. LLVM 12 (since https://reviews.llvm.org/D90108) will error on such an overridden symbol binding. Commit ef1e03152cb0 ("x86/asm: Make some functions local") changed ENTRY in arch/x86/lib/memcpy_64.S to SYM_FUNC_START_LOCAL, which was ineffective due to the preceding .weak directive. Use the appropriate SYM_FUNC_START_WEAK instead. Fixes: 393f203f5fd5 ("x86_64: kasan: add interceptors for memset/memmove/memcpy functions") Fixes: ef1e03152cb0 ("x86/asm: Make some functions local") Reported-by: Sami Tolvanen Signed-off-by: Fangrui Song Signed-off-by: Borislav Petkov Reviewed-by: Nick Desaulniers Tested-by: Nathan Chancellor Tested-by: Nick Desaulniers Cc: Link: https://lkml.kernel.org/r/20201103012358.168682-1-maskray@google.com --- arch/x86/lib/memcpy_64.S | 4 +--- arch/x86/lib/memmove_64.S | 4 +--- arch/x86/lib/memset_64.S | 4 +--- 3 files changed, 3 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index 037faac46b0c..1e299ac73c86 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -16,8 +16,6 @@ * to a jmp to memcpy_erms which does the REP; MOVSB mem copy. */ -.weak memcpy - /* * memcpy - Copy a memory block. * @@ -30,7 +28,7 @@ * rax original destination */ SYM_FUNC_START_ALIAS(__memcpy) -SYM_FUNC_START_LOCAL(memcpy) +SYM_FUNC_START_WEAK(memcpy) ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \ "jmp memcpy_erms", X86_FEATURE_ERMS diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S index 7ff00ea64e4f..41902fe8b859 100644 --- a/arch/x86/lib/memmove_64.S +++ b/arch/x86/lib/memmove_64.S @@ -24,9 +24,7 @@ * Output: * rax: dest */ -.weak memmove - -SYM_FUNC_START_ALIAS(memmove) +SYM_FUNC_START_WEAK(memmove) SYM_FUNC_START(__memmove) mov %rdi, %rax diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S index 9ff15ee404a4..0bfd26e4ca9e 100644 --- a/arch/x86/lib/memset_64.S +++ b/arch/x86/lib/memset_64.S @@ -6,8 +6,6 @@ #include #include -.weak memset - /* * ISO C memset - set a memory block to a byte value. This function uses fast * string to get better performance than the original function. The code is @@ -19,7 +17,7 @@ * * rax original destination */ -SYM_FUNC_START_ALIAS(memset) +SYM_FUNC_START_WEAK(memset) SYM_FUNC_START(__memset) /* * Some CPUs support enhanced REP MOVSB/STOSB feature. It is recommended -- cgit v1.2.3 From 9d750c75bd2c3fcf20a3c15378d1bc6b2d4ec31f Mon Sep 17 00:00:00 2001 From: Ryan Kosta Date: Sat, 10 Oct 2020 20:03:51 -0700 Subject: risc-v: kernel: ftrace: Fixes improper SPDX comment style Signed-off-by: Ryan Kosta Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/riscv/kernel/ftrace.c b/arch/riscv/kernel/ftrace.c index 99e12faa5498..765b62434f30 100644 --- a/arch/riscv/kernel/ftrace.c +++ b/arch/riscv/kernel/ftrace.c @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2013 Linaro Limited * Author: AKASHI Takahiro -- cgit v1.2.3 From 1344a232016dbb0492be81f8517c4bf8fc1c6610 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 4 Nov 2020 22:17:42 +1100 Subject: powerpc: Use asm_goto_volatile for put_user() Andreas reported that commit ee0a49a6870e ("powerpc/uaccess: Switch __put_user_size_allowed() to __put_user_asm_goto()") broke CLONE_CHILD_SETTID. Further inspection showed that the put_user() in schedule_tail() was missing entirely, the store not emitted by the compiler. <.schedule_tail>: mflr r0 std r0,16(r1) stdu r1,-112(r1) bl <.finish_task_switch> ld r9,2496(r3) cmpdi cr7,r9,0 bne cr7,<.schedule_tail+0x60> ld r3,392(r13) ld r9,1392(r3) cmpdi cr7,r9,0 beq cr7,<.schedule_tail+0x3c> li r4,0 li r5,0 bl <.__task_pid_nr_ns> nop bl <.calculate_sigpending> nop addi r1,r1,112 ld r0,16(r1) mtlr r0 blr nop nop nop bl <.__balance_callback> b <.schedule_tail+0x1c> Notice there are no stores other than to the stack. There should be a stw in there for the store to current->set_child_tid. This is only seen with GCC 4.9 era compilers (tested with 4.9.3 and 4.9.4), and only when CONFIG_PPC_KUAP is disabled. When CONFIG_PPC_KUAP=y, the inline asm that's part of the isync() and mtspr() inlined via allow_user_access() seems to be enough to avoid the bug. We already have a macro to work around this (or a similar bug), called asm_volatile_goto which includes an empty asm block to tickle the compiler into generating the right code. So use that. With this applied the code generation looks more like it will work: <.schedule_tail>: mflr r0 std r31,-8(r1) std r0,16(r1) stdu r1,-144(r1) std r3,112(r1) bl <._mcount> nop ld r3,112(r1) bl <.finish_task_switch> ld r9,2624(r3) cmpdi cr7,r9,0 bne cr7,<.schedule_tail+0xa0> ld r3,2408(r13) ld r31,1856(r3) cmpdi cr7,r31,0 beq cr7,<.schedule_tail+0x80> li r4,0 li r5,0 bl <.__task_pid_nr_ns> nop li r9,-1 clrldi r9,r9,12 cmpld cr7,r31,r9 bgt cr7,<.schedule_tail+0x80> lis r9,16 rldicr r9,r9,32,31 subf r9,r31,r9 cmpldi cr7,r9,3 ble cr7,<.schedule_tail+0x80> li r9,0 stw r3,0(r31) <-- stw nop bl <.calculate_sigpending> nop addi r1,r1,144 ld r0,16(r1) ld r31,-8(r1) mtlr r0 blr nop bl <.__balance_callback> b <.schedule_tail+0x30> Fixes: ee0a49a6870e ("powerpc/uaccess: Switch __put_user_size_allowed() to __put_user_asm_goto()") Reported-by: Andreas Schwab Tested-by: Andreas Schwab Suggested-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20201104111742.672142-1-mpe@ellerman.id.au --- arch/powerpc/include/asm/uaccess.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h index ef5bbb705c08..501c9a79038c 100644 --- a/arch/powerpc/include/asm/uaccess.h +++ b/arch/powerpc/include/asm/uaccess.h @@ -178,7 +178,7 @@ do { \ * are no aliasing issues. */ #define __put_user_asm_goto(x, addr, label, op) \ - asm volatile goto( \ + asm_volatile_goto( \ "1: " op "%U1%X1 %0,%1 # put_user\n" \ EX_TABLE(1b, %l2) \ : \ @@ -191,7 +191,7 @@ do { \ __put_user_asm_goto(x, ptr, label, "std") #else /* __powerpc64__ */ #define __put_user_asm2_goto(x, addr, label) \ - asm volatile goto( \ + asm_volatile_goto( \ "1: stw%X1 %0, %1\n" \ "2: stw%X1 %L0, %L1\n" \ EX_TABLE(1b, %l2) \ -- cgit v1.2.3 From 11522448e641e8f1690c9db06e01985e8e19b401 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Sat, 10 Oct 2020 15:14:30 +0000 Subject: powerpc/603: Always fault when _PAGE_ACCESSED is not set The kernel expects pte_young() to work regardless of CONFIG_SWAP. Make sure a minor fault is taken to set _PAGE_ACCESSED when it is not already set, regardless of the selection of CONFIG_SWAP. Fixes: 84de6ab0e904 ("powerpc/603: don't handle PAGE_ACCESSED in TLB miss handlers.") Cc: stable@vger.kernel.org Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/a44367744de54e2315b2f1a8cbbd7f88488072e0.1602342806.git.christophe.leroy@csgroup.eu --- arch/powerpc/kernel/head_book3s_32.S | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/head_book3s_32.S b/arch/powerpc/kernel/head_book3s_32.S index 5eb9eedac920..2aa16d5368e1 100644 --- a/arch/powerpc/kernel/head_book3s_32.S +++ b/arch/powerpc/kernel/head_book3s_32.S @@ -457,11 +457,7 @@ InstructionTLBMiss: cmplw 0,r1,r3 #endif mfspr r2, SPRN_SPRG_PGDIR -#ifdef CONFIG_SWAP li r1,_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_EXEC -#else - li r1,_PAGE_PRESENT | _PAGE_EXEC -#endif #if defined(CONFIG_MODULES) || defined(CONFIG_DEBUG_PAGEALLOC) bgt- 112f lis r2, (swapper_pg_dir - PAGE_OFFSET)@ha /* if kernel address, use */ @@ -523,11 +519,7 @@ DataLoadTLBMiss: lis r1, TASK_SIZE@h /* check if kernel address */ cmplw 0,r1,r3 mfspr r2, SPRN_SPRG_PGDIR -#ifdef CONFIG_SWAP li r1, _PAGE_PRESENT | _PAGE_ACCESSED -#else - li r1, _PAGE_PRESENT -#endif bgt- 112f lis r2, (swapper_pg_dir - PAGE_OFFSET)@ha /* if kernel address, use */ addi r2, r2, (swapper_pg_dir - PAGE_OFFSET)@l /* kernel page table */ @@ -603,11 +595,7 @@ DataStoreTLBMiss: lis r1, TASK_SIZE@h /* check if kernel address */ cmplw 0,r1,r3 mfspr r2, SPRN_SPRG_PGDIR -#ifdef CONFIG_SWAP li r1, _PAGE_RW | _PAGE_DIRTY | _PAGE_PRESENT | _PAGE_ACCESSED -#else - li r1, _PAGE_RW | _PAGE_DIRTY | _PAGE_PRESENT -#endif bgt- 112f lis r2, (swapper_pg_dir - PAGE_OFFSET)@ha /* if kernel address, use */ addi r2, r2, (swapper_pg_dir - PAGE_OFFSET)@l /* kernel page table */ -- cgit v1.2.3 From 0540b0d2ce9073fd2a736d636218faa61c99e572 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Sat, 10 Oct 2020 15:14:29 +0000 Subject: powerpc/40x: Always fault when _PAGE_ACCESSED is not set The kernel expects pte_young() to work regardless of CONFIG_SWAP. Make sure a minor fault is taken to set _PAGE_ACCESSED when it is not already set, regardless of the selection of CONFIG_SWAP. Fixes: 2c74e2586bb9 ("powerpc/40x: Rework 40x PTE access and TLB miss") Cc: stable@vger.kernel.org Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/b02ca2ed2d3676a096219b48c0f69ec982a75bcf.1602342801.git.christophe.leroy@csgroup.eu --- arch/powerpc/kernel/head_40x.S | 8 -------- 1 file changed, 8 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/head_40x.S b/arch/powerpc/kernel/head_40x.S index 44c9018aed1b..a1ae00689e0f 100644 --- a/arch/powerpc/kernel/head_40x.S +++ b/arch/powerpc/kernel/head_40x.S @@ -284,11 +284,7 @@ _ENTRY(saved_ksp_limit) rlwimi r11, r10, 22, 20, 29 /* Compute PTE address */ lwz r11, 0(r11) /* Get Linux PTE */ -#ifdef CONFIG_SWAP li r9, _PAGE_PRESENT | _PAGE_ACCESSED -#else - li r9, _PAGE_PRESENT -#endif andc. r9, r9, r11 /* Check permission */ bne 5f @@ -369,11 +365,7 @@ _ENTRY(saved_ksp_limit) rlwimi r11, r10, 22, 20, 29 /* Compute PTE address */ lwz r11, 0(r11) /* Get Linux PTE */ -#ifdef CONFIG_SWAP li r9, _PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_EXEC -#else - li r9, _PAGE_PRESENT | _PAGE_EXEC -#endif andc. r9, r9, r11 /* Check permission */ bne 5f -- cgit v1.2.3 From 29daf869cbab69088fe1755d9dd224e99ba78b56 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 12 Oct 2020 08:54:31 +0000 Subject: powerpc/8xx: Always fault when _PAGE_ACCESSED is not set The kernel expects pte_young() to work regardless of CONFIG_SWAP. Make sure a minor fault is taken to set _PAGE_ACCESSED when it is not already set, regardless of the selection of CONFIG_SWAP. This adds at least 3 instructions to the TLB miss exception handlers fast path. Following patch will reduce this overhead. Also update the rotation instruction to the correct number of bits to reflect all changes done to _PAGE_ACCESSED over time. Fixes: d069cb4373fe ("powerpc/8xx: Don't touch ACCESSED when no SWAP.") Fixes: 5f356497c384 ("powerpc/8xx: remove unused _PAGE_WRITETHRU") Fixes: e0a8e0d90a9f ("powerpc/8xx: Handle PAGE_USER via APG bits") Fixes: 5b2753fc3e8a ("powerpc/8xx: Implementation of PAGE_EXEC") Fixes: a891c43b97d3 ("powerpc/8xx: Prepare handlers for _PAGE_HUGE for 512k pages.") Cc: stable@vger.kernel.org Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/af834e8a0f1fa97bfae65664950f0984a70c4750.1602492856.git.christophe.leroy@csgroup.eu --- arch/powerpc/kernel/head_8xx.S | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S index 9f359d3fba74..6f3799a04121 100644 --- a/arch/powerpc/kernel/head_8xx.S +++ b/arch/powerpc/kernel/head_8xx.S @@ -202,9 +202,7 @@ SystemCall: InstructionTLBMiss: mtspr SPRN_SPRG_SCRATCH0, r10 -#if defined(ITLB_MISS_KERNEL) || defined(CONFIG_SWAP) || defined(CONFIG_HUGETLBFS) mtspr SPRN_SPRG_SCRATCH1, r11 -#endif /* If we are faulting a kernel address, we have to use the * kernel page tables. @@ -238,11 +236,9 @@ InstructionTLBMiss: rlwimi r11, r10, 32 - 9, _PMD_PAGE_512K mtspr SPRN_MI_TWC, r11 #endif -#ifdef CONFIG_SWAP - rlwinm r11, r10, 32-5, _PAGE_PRESENT + rlwinm r11, r10, 32-7, _PAGE_PRESENT and r11, r11, r10 rlwimi r10, r11, 0, _PAGE_PRESENT -#endif /* The Linux PTE won't go exactly into the MMU TLB. * Software indicator bits 20 and 23 must be clear. * Software indicator bits 22, 24, 25, 26, and 27 must be @@ -256,9 +252,7 @@ InstructionTLBMiss: /* Restore registers */ 0: mfspr r10, SPRN_SPRG_SCRATCH0 -#if defined(ITLB_MISS_KERNEL) || defined(CONFIG_SWAP) || defined(CONFIG_HUGETLBFS) mfspr r11, SPRN_SPRG_SCRATCH1 -#endif rfi patch_site 0b, patch__itlbmiss_exit_1 @@ -268,9 +262,7 @@ InstructionTLBMiss: addi r10, r10, 1 stw r10, (itlb_miss_counter - PAGE_OFFSET)@l(0) mfspr r10, SPRN_SPRG_SCRATCH0 -#if defined(ITLB_MISS_KERNEL) || defined(CONFIG_SWAP) mfspr r11, SPRN_SPRG_SCRATCH1 -#endif rfi #endif @@ -316,11 +308,9 @@ DataStoreTLBMiss: * r11 = ((r10 & PRESENT) & ((r10 & ACCESSED) >> 5)); * r10 = (r10 & ~PRESENT) | r11; */ -#ifdef CONFIG_SWAP - rlwinm r11, r10, 32-5, _PAGE_PRESENT + rlwinm r11, r10, 32-7, _PAGE_PRESENT and r11, r11, r10 rlwimi r10, r11, 0, _PAGE_PRESENT -#endif /* The Linux PTE won't go exactly into the MMU TLB. * Software indicator bits 24, 25, 26, and 27 must be * set. All other Linux PTE bits control the behavior -- cgit v1.2.3 From 33fe43cfd9b1c20f6f9899b44bf04e91823ff1c9 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 12 Oct 2020 08:54:33 +0000 Subject: powerpc/8xx: Manage _PAGE_ACCESSED through APG bits in L1 entry When _PAGE_ACCESSED is not set, a minor fault is expected. To do this, TLB miss exception ANDs _PAGE_PRESENT and _PAGE_ACCESSED into the L2 entry valid bit. To simplify the processing and reduce the number of instructions in TLB miss exceptions, manage it as an APG bit and get it next to _PAGE_GUARDED bit to allow a copy in one go. Then declare the corresponding groups as handling all accesses as user accesses. As the PP bits always define user as No Access, it will generate a fault. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/80f488db230c6b0e7b3b990d72bd94a8a069e93e.1602492856.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/nohash/32/kup-8xx.h | 2 +- arch/powerpc/include/asm/nohash/32/mmu-8xx.h | 47 +++++++++------------------- arch/powerpc/include/asm/nohash/32/pte-8xx.h | 9 +++--- arch/powerpc/kernel/head_8xx.S | 36 +++++---------------- 4 files changed, 28 insertions(+), 66 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/nohash/32/kup-8xx.h b/arch/powerpc/include/asm/nohash/32/kup-8xx.h index 85ed2390fb99..567cdc557402 100644 --- a/arch/powerpc/include/asm/nohash/32/kup-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/kup-8xx.h @@ -63,7 +63,7 @@ static inline void restore_user_access(unsigned long flags) static inline bool bad_kuap_fault(struct pt_regs *regs, unsigned long address, bool is_write) { - return WARN(!((regs->kuap ^ MD_APG_KUAP) & 0xf0000000), + return WARN(!((regs->kuap ^ MD_APG_KUAP) & 0xff000000), "Bug: fault blocked by AP register !"); } diff --git a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h index 1d9ac0f9c794..0bd1b144eb76 100644 --- a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h @@ -33,19 +33,18 @@ * respectively NA for All or X for Supervisor and no access for User. * Then we use the APG to say whether accesses are according to Page rules or * "all Supervisor" rules (Access to all) - * Therefore, we define 2 APG groups. lsb is _PMD_USER - * 0 => Kernel => 01 (all accesses performed according to page definition) - * 1 => User => 00 (all accesses performed as supervisor iaw page definition) - * 2-15 => Not Used - */ -#define MI_APG_INIT 0x40000000 - -/* - * 0 => Kernel => 01 (all accesses performed according to page definition) - * 1 => User => 10 (all accesses performed according to swaped page definition) - * 2-15 => Not Used - */ -#define MI_APG_KUEP 0x60000000 + * _PAGE_ACCESSED is also managed via APG. When _PAGE_ACCESSED is not set, say + * "all User" rules, that will lead to NA for all. + * Therefore, we define 4 APG groups. lsb is _PAGE_ACCESSED + * 0 => Kernel => 11 (all accesses performed according as user iaw page definition) + * 1 => Kernel+Accessed => 01 (all accesses performed according to page definition) + * 2 => User => 11 (all accesses performed according as user iaw page definition) + * 3 => User+Accessed => 00 (all accesses performed as supervisor iaw page definition) for INIT + * => 10 (all accesses performed according to swaped page definition) for KUEP + * 4-15 => Not Used + */ +#define MI_APG_INIT 0xdc000000 +#define MI_APG_KUEP 0xde000000 /* The effective page number register. When read, contains the information * about the last instruction TLB miss. When MI_RPN is written, bits in @@ -106,25 +105,9 @@ #define MD_Ks 0x80000000 /* Should not be set */ #define MD_Kp 0x40000000 /* Should always be set */ -/* - * All pages' PP data bits are set to either 000 or 011 or 001, which means - * respectively RW for Supervisor and no access for User, or RO for - * Supervisor and no access for user and NA for ALL. - * Then we use the APG to say whether accesses are according to Page rules or - * "all Supervisor" rules (Access to all) - * Therefore, we define 2 APG groups. lsb is _PMD_USER - * 0 => Kernel => 01 (all accesses performed according to page definition) - * 1 => User => 00 (all accesses performed as supervisor iaw page definition) - * 2-15 => Not Used - */ -#define MD_APG_INIT 0x40000000 - -/* - * 0 => No user => 01 (all accesses performed according to page definition) - * 1 => User => 10 (all accesses performed according to swaped page definition) - * 2-15 => Not Used - */ -#define MD_APG_KUAP 0x60000000 +/* See explanation above at the definition of MI_APG_INIT */ +#define MD_APG_INIT 0xdc000000 +#define MD_APG_KUAP 0xde000000 /* The effective page number register. When read, contains the information * about the last instruction TLB miss. When MD_RPN is written, bits in diff --git a/arch/powerpc/include/asm/nohash/32/pte-8xx.h b/arch/powerpc/include/asm/nohash/32/pte-8xx.h index 66f403a7da44..1581204467e1 100644 --- a/arch/powerpc/include/asm/nohash/32/pte-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/pte-8xx.h @@ -39,9 +39,9 @@ * into the TLB. */ #define _PAGE_GUARDED 0x0010 /* Copied to L1 G entry in DTLB */ -#define _PAGE_SPECIAL 0x0020 /* SW entry */ +#define _PAGE_ACCESSED 0x0020 /* Copied to L1 APG 1 entry in I/DTLB */ #define _PAGE_EXEC 0x0040 /* Copied to PP (bit 21) in ITLB */ -#define _PAGE_ACCESSED 0x0080 /* software: page referenced */ +#define _PAGE_SPECIAL 0x0080 /* SW entry */ #define _PAGE_NA 0x0200 /* Supervisor NA, User no access */ #define _PAGE_RO 0x0600 /* Supervisor RO, User no access */ @@ -59,11 +59,12 @@ #define _PMD_PRESENT 0x0001 #define _PMD_PRESENT_MASK _PMD_PRESENT -#define _PMD_BAD 0x0fd0 +#define _PMD_BAD 0x0f90 #define _PMD_PAGE_MASK 0x000c #define _PMD_PAGE_8M 0x000c #define _PMD_PAGE_512K 0x0004 -#define _PMD_USER 0x0020 /* APG 1 */ +#define _PMD_ACCESSED 0x0020 /* APG 1 */ +#define _PMD_USER 0x0040 /* APG 2 */ #define _PTE_NONE_MASK 0 diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S index 6f3799a04121..ee0bfebc375f 100644 --- a/arch/powerpc/kernel/head_8xx.S +++ b/arch/powerpc/kernel/head_8xx.S @@ -222,23 +222,13 @@ InstructionTLBMiss: 3: mtcr r11 #endif -#if defined(CONFIG_HUGETLBFS) || !defined(CONFIG_PIN_TLB_TEXT) lwz r11, (swapper_pg_dir-PAGE_OFFSET)@l(r10) /* Get level 1 entry */ mtspr SPRN_MD_TWC, r11 -#else - lwz r10, (swapper_pg_dir-PAGE_OFFSET)@l(r10) /* Get level 1 entry */ - mtspr SPRN_MI_TWC, r10 /* Set segment attributes */ - mtspr SPRN_MD_TWC, r10 -#endif mfspr r10, SPRN_MD_TWC lwz r10, 0(r10) /* Get the pte */ -#if defined(CONFIG_HUGETLBFS) || !defined(CONFIG_PIN_TLB_TEXT) + rlwimi r11, r10, 0, _PAGE_GUARDED | _PAGE_ACCESSED rlwimi r11, r10, 32 - 9, _PMD_PAGE_512K mtspr SPRN_MI_TWC, r11 -#endif - rlwinm r11, r10, 32-7, _PAGE_PRESENT - and r11, r11, r10 - rlwimi r10, r11, 0, _PAGE_PRESENT /* The Linux PTE won't go exactly into the MMU TLB. * Software indicator bits 20 and 23 must be clear. * Software indicator bits 22, 24, 25, 26, and 27 must be @@ -289,28 +279,16 @@ DataStoreTLBMiss: mfspr r10, SPRN_MD_TWC lwz r10, 0(r10) /* Get the pte */ - /* Insert the Guarded flag into the TWC from the Linux PTE. + /* Insert Guarded and Accessed flags into the TWC from the Linux PTE. * It is bit 27 of both the Linux PTE and the TWC (at least * I got that right :-). It will be better when we can put * this into the Linux pgd/pmd and load it in the operation * above. */ - rlwimi r11, r10, 0, _PAGE_GUARDED + rlwimi r11, r10, 0, _PAGE_GUARDED | _PAGE_ACCESSED rlwimi r11, r10, 32 - 9, _PMD_PAGE_512K mtspr SPRN_MD_TWC, r11 - /* Both _PAGE_ACCESSED and _PAGE_PRESENT has to be set. - * We also need to know if the insn is a load/store, so: - * Clear _PAGE_PRESENT and load that which will - * trap into DTLB Error with store bit set accordinly. - */ - /* PRESENT=0x1, ACCESSED=0x20 - * r11 = ((r10 & PRESENT) & ((r10 & ACCESSED) >> 5)); - * r10 = (r10 & ~PRESENT) | r11; - */ - rlwinm r11, r10, 32-7, _PAGE_PRESENT - and r11, r11, r10 - rlwimi r10, r11, 0, _PAGE_PRESENT /* The Linux PTE won't go exactly into the MMU TLB. * Software indicator bits 24, 25, 26, and 27 must be * set. All other Linux PTE bits control the behavior @@ -701,7 +679,7 @@ initial_mmu: li r9, 4 /* up to 4 pages of 8M */ mtctr r9 lis r9, KERNELBASE@h /* Create vaddr for TLB */ - li r10, MI_PS8MEG | MI_SVALID /* Set 8M byte page */ + li r10, MI_PS8MEG | _PMD_ACCESSED | MI_SVALID li r11, MI_BOOTINIT /* Create RPN for address 0 */ 1: mtspr SPRN_MI_CTR, r8 /* Set instruction MMU control */ @@ -765,7 +743,7 @@ _GLOBAL(mmu_pin_tlb) #ifdef CONFIG_PIN_TLB_TEXT LOAD_REG_IMMEDIATE(r5, 28 << 8) LOAD_REG_IMMEDIATE(r6, PAGE_OFFSET) - LOAD_REG_IMMEDIATE(r7, MI_SVALID | MI_PS8MEG) + LOAD_REG_IMMEDIATE(r7, MI_SVALID | MI_PS8MEG | _PMD_ACCESSED) LOAD_REG_IMMEDIATE(r8, 0xf0 | _PAGE_RO | _PAGE_SPS | _PAGE_SH | _PAGE_PRESENT) LOAD_REG_ADDR(r9, _sinittext) li r0, 4 @@ -787,7 +765,7 @@ _GLOBAL(mmu_pin_tlb) LOAD_REG_IMMEDIATE(r5, 28 << 8 | MD_TWAM) #ifdef CONFIG_PIN_TLB_DATA LOAD_REG_IMMEDIATE(r6, PAGE_OFFSET) - LOAD_REG_IMMEDIATE(r7, MI_SVALID | MI_PS8MEG) + LOAD_REG_IMMEDIATE(r7, MI_SVALID | MI_PS8MEG | _PMD_ACCESSED) #ifdef CONFIG_PIN_TLB_IMMR li r0, 3 #else @@ -824,7 +802,7 @@ _GLOBAL(mmu_pin_tlb) #endif #ifdef CONFIG_PIN_TLB_IMMR LOAD_REG_IMMEDIATE(r0, VIRT_IMMR_BASE | MD_EVALID) - LOAD_REG_IMMEDIATE(r7, MD_SVALID | MD_PS512K | MD_GUARDED) + LOAD_REG_IMMEDIATE(r7, MD_SVALID | MD_PS512K | MD_GUARDED | _PMD_ACCESSED) mfspr r8, SPRN_IMMR rlwinm r8, r8, 0, 0xfff80000 ori r8, r8, 0xf0 | _PAGE_DIRTY | _PAGE_SPS | _PAGE_SH | \ -- cgit v1.2.3 From 1bd14a66ee5200d6a24419cbd2e0a0fccd4da36f Mon Sep 17 00:00:00 2001 From: Atish Patra Date: Wed, 7 Oct 2020 14:51:59 -0700 Subject: RISC-V: Remove any memblock representing unusable memory area RISC-V limits the physical memory size by -PAGE_OFFSET. Any memory beyond that size from DRAM start is unusable. Just remove any memblock pointing to those memory region without worrying about computing the maximum size. Signed-off-by: Atish Patra Reviewed-by: Mike Rapoport Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/init.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) (limited to 'arch') diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index ea933b789a88..42c14eb6bc43 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -154,9 +154,8 @@ disable: void __init setup_bootmem(void) { - phys_addr_t mem_size = 0; - phys_addr_t total_mem = 0; - phys_addr_t mem_start, start, end = 0; + phys_addr_t mem_start = 0; + phys_addr_t start, end = 0; phys_addr_t vmlinux_end = __pa_symbol(&_end); phys_addr_t vmlinux_start = __pa_symbol(&_start); u64 i; @@ -164,21 +163,18 @@ void __init setup_bootmem(void) /* Find the memory region containing the kernel */ for_each_mem_range(i, &start, &end) { phys_addr_t size = end - start; - if (!total_mem) + if (!mem_start) mem_start = start; if (start <= vmlinux_start && vmlinux_end <= end) BUG_ON(size == 0); - total_mem = total_mem + size; } /* - * Remove memblock from the end of usable area to the - * end of region + * The maximal physical memory size is -PAGE_OFFSET. + * Make sure that any memory beyond mem_start + (-PAGE_OFFSET) is removed + * as it is unusable by kernel. */ - mem_size = min(total_mem, (phys_addr_t)-PAGE_OFFSET); - if (mem_start + mem_size < end) - memblock_remove(mem_start + mem_size, - end - mem_start - mem_size); + memblock_enforce_memory_limit(mem_start - PAGE_OFFSET); /* Reserve from the start of the kernel to the end of the kernel */ memblock_reserve(vmlinux_start, vmlinux_end - vmlinux_start); -- cgit v1.2.3 From 1978b3a53a74e3230cd46932b149c6e62e832e9a Mon Sep 17 00:00:00 2001 From: Anand K Mistry Date: Thu, 5 Nov 2020 16:33:04 +1100 Subject: x86/speculation: Allow IBPB to be conditionally enabled on CPUs with always-on STIBP On AMD CPUs which have the feature X86_FEATURE_AMD_STIBP_ALWAYS_ON, STIBP is set to on and spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED At the same time, IBPB can be set to conditional. However, this leads to the case where it's impossible to turn on IBPB for a process because in the PR_SPEC_DISABLE case in ib_prctl_set() the spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED condition leads to a return before the task flag is set. Similarly, ib_prctl_get() will return PR_SPEC_DISABLE even though IBPB is set to conditional. More generally, the following cases are possible: 1. STIBP = conditional && IBPB = on for spectre_v2_user=seccomp,ibpb 2. STIBP = on && IBPB = conditional for AMD CPUs with X86_FEATURE_AMD_STIBP_ALWAYS_ON The first case functions correctly today, but only because spectre_v2_user_ibpb isn't updated to reflect the IBPB mode. At a high level, this change does one thing. If either STIBP or IBPB is set to conditional, allow the prctl to change the task flag. Also, reflect that capability when querying the state. This isn't perfect since it doesn't take into account if only STIBP or IBPB is unconditionally on. But it allows the conditional feature to work as expected, without affecting the unconditional one. [ bp: Massage commit message and comment; space out statements for better readability. ] Fixes: 21998a351512 ("x86/speculation: Avoid force-disabling IBPB based on STIBP and enhanced IBRS.") Signed-off-by: Anand K Mistry Signed-off-by: Borislav Petkov Acked-by: Thomas Gleixner Acked-by: Tom Lendacky Link: https://lkml.kernel.org/r/20201105163246.v2.1.Ifd7243cd3e2c2206a893ad0a5b9a4f19549e22c6@changeid --- arch/x86/kernel/cpu/bugs.c | 51 ++++++++++++++++++++++++++++++---------------- 1 file changed, 33 insertions(+), 18 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index d3f0db463f96..581fb7223ad0 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -1254,6 +1254,14 @@ static int ssb_prctl_set(struct task_struct *task, unsigned long ctrl) return 0; } +static bool is_spec_ib_user_controlled(void) +{ + return spectre_v2_user_ibpb == SPECTRE_V2_USER_PRCTL || + spectre_v2_user_ibpb == SPECTRE_V2_USER_SECCOMP || + spectre_v2_user_stibp == SPECTRE_V2_USER_PRCTL || + spectre_v2_user_stibp == SPECTRE_V2_USER_SECCOMP; +} + static int ib_prctl_set(struct task_struct *task, unsigned long ctrl) { switch (ctrl) { @@ -1261,16 +1269,26 @@ static int ib_prctl_set(struct task_struct *task, unsigned long ctrl) if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE && spectre_v2_user_stibp == SPECTRE_V2_USER_NONE) return 0; + /* - * Indirect branch speculation is always disabled in strict - * mode. It can neither be enabled if it was force-disabled - * by a previous prctl call. + * With strict mode for both IBPB and STIBP, the instruction + * code paths avoid checking this task flag and instead, + * unconditionally run the instruction. However, STIBP and IBPB + * are independent and either can be set to conditionally + * enabled regardless of the mode of the other. + * + * If either is set to conditional, allow the task flag to be + * updated, unless it was force-disabled by a previous prctl + * call. Currently, this is possible on an AMD CPU which has the + * feature X86_FEATURE_AMD_STIBP_ALWAYS_ON. In this case, if the + * kernel is booted with 'spectre_v2_user=seccomp', then + * spectre_v2_user_ibpb == SPECTRE_V2_USER_SECCOMP and + * spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED. */ - if (spectre_v2_user_ibpb == SPECTRE_V2_USER_STRICT || - spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT || - spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED || + if (!is_spec_ib_user_controlled() || task_spec_ib_force_disable(task)) return -EPERM; + task_clear_spec_ib_disable(task); task_update_spec_tif(task); break; @@ -1283,10 +1301,10 @@ static int ib_prctl_set(struct task_struct *task, unsigned long ctrl) if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE && spectre_v2_user_stibp == SPECTRE_V2_USER_NONE) return -EPERM; - if (spectre_v2_user_ibpb == SPECTRE_V2_USER_STRICT || - spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT || - spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED) + + if (!is_spec_ib_user_controlled()) return 0; + task_set_spec_ib_disable(task); if (ctrl == PR_SPEC_FORCE_DISABLE) task_set_spec_ib_force_disable(task); @@ -1351,20 +1369,17 @@ static int ib_prctl_get(struct task_struct *task) if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE && spectre_v2_user_stibp == SPECTRE_V2_USER_NONE) return PR_SPEC_ENABLE; - else if (spectre_v2_user_ibpb == SPECTRE_V2_USER_STRICT || - spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT || - spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED) - return PR_SPEC_DISABLE; - else if (spectre_v2_user_ibpb == SPECTRE_V2_USER_PRCTL || - spectre_v2_user_ibpb == SPECTRE_V2_USER_SECCOMP || - spectre_v2_user_stibp == SPECTRE_V2_USER_PRCTL || - spectre_v2_user_stibp == SPECTRE_V2_USER_SECCOMP) { + else if (is_spec_ib_user_controlled()) { if (task_spec_ib_force_disable(task)) return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE; if (task_spec_ib_disable(task)) return PR_SPEC_PRCTL | PR_SPEC_DISABLE; return PR_SPEC_PRCTL | PR_SPEC_ENABLE; - } else + } else if (spectre_v2_user_ibpb == SPECTRE_V2_USER_STRICT || + spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT || + spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED) + return PR_SPEC_DISABLE; + else return PR_SPEC_NOT_AFFECTED; } -- cgit v1.2.3 From 108aa503657ee2fe8aa071dc620d96372c252ecd Mon Sep 17 00:00:00 2001 From: Benjamin Gwin Date: Tue, 3 Nov 2020 12:11:06 -0800 Subject: arm64: kexec_file: try more regions if loading segments fails It's possible that the first region picked for the new kernel will make it impossible to fit the other segments in the required 32GB window, especially if we have a very large initrd. Instead of giving up, we can keep testing other regions for the kernel until we find one that works. Suggested-by: Ryan O'Leary Signed-off-by: Benjamin Gwin Link: https://lore.kernel.org/r/20201103201106.2397844-1-bgwin@google.com Signed-off-by: Will Deacon --- arch/arm64/kernel/kexec_image.c | 41 +++++++++++++++++++++++++--------- arch/arm64/kernel/machine_kexec_file.c | 9 +++++++- 2 files changed, 39 insertions(+), 11 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kernel/kexec_image.c b/arch/arm64/kernel/kexec_image.c index af9987c154ca..66adee8b5fc8 100644 --- a/arch/arm64/kernel/kexec_image.c +++ b/arch/arm64/kernel/kexec_image.c @@ -43,7 +43,7 @@ static void *image_load(struct kimage *image, u64 flags, value; bool be_image, be_kernel; struct kexec_buf kbuf; - unsigned long text_offset; + unsigned long text_offset, kernel_segment_number; struct kexec_segment *kernel_segment; int ret; @@ -88,11 +88,37 @@ static void *image_load(struct kimage *image, /* Adjust kernel segment with TEXT_OFFSET */ kbuf.memsz += text_offset; - ret = kexec_add_buffer(&kbuf); - if (ret) + kernel_segment_number = image->nr_segments; + + /* + * The location of the kernel segment may make it impossible to satisfy + * the other segment requirements, so we try repeatedly to find a + * location that will work. + */ + while ((ret = kexec_add_buffer(&kbuf)) == 0) { + /* Try to load additional data */ + kernel_segment = &image->segment[kernel_segment_number]; + ret = load_other_segments(image, kernel_segment->mem, + kernel_segment->memsz, initrd, + initrd_len, cmdline); + if (!ret) + break; + + /* + * We couldn't find space for the other segments; erase the + * kernel segment and try the next available hole. + */ + image->nr_segments -= 1; + kbuf.buf_min = kernel_segment->mem + kernel_segment->memsz; + kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; + } + + if (ret) { + pr_err("Could not find any suitable kernel location!"); return ERR_PTR(ret); + } - kernel_segment = &image->segment[image->nr_segments - 1]; + kernel_segment = &image->segment[kernel_segment_number]; kernel_segment->mem += text_offset; kernel_segment->memsz -= text_offset; image->start = kernel_segment->mem; @@ -101,12 +127,7 @@ static void *image_load(struct kimage *image, kernel_segment->mem, kbuf.bufsz, kernel_segment->memsz); - /* Load additional data */ - ret = load_other_segments(image, - kernel_segment->mem, kernel_segment->memsz, - initrd, initrd_len, cmdline); - - return ERR_PTR(ret); + return 0; } #ifdef CONFIG_KEXEC_IMAGE_VERIFY_SIG diff --git a/arch/arm64/kernel/machine_kexec_file.c b/arch/arm64/kernel/machine_kexec_file.c index 5b0e67b93cdc..03210f644790 100644 --- a/arch/arm64/kernel/machine_kexec_file.c +++ b/arch/arm64/kernel/machine_kexec_file.c @@ -240,6 +240,11 @@ static int prepare_elf_headers(void **addr, unsigned long *sz) return ret; } +/* + * Tries to add the initrd and DTB to the image. If it is not possible to find + * valid locations, this function will undo changes to the image and return non + * zero. + */ int load_other_segments(struct kimage *image, unsigned long kernel_load_addr, unsigned long kernel_size, @@ -248,7 +253,8 @@ int load_other_segments(struct kimage *image, { struct kexec_buf kbuf; void *headers, *dtb = NULL; - unsigned long headers_sz, initrd_load_addr = 0, dtb_len; + unsigned long headers_sz, initrd_load_addr = 0, dtb_len, + orig_segments = image->nr_segments; int ret = 0; kbuf.image = image; @@ -334,6 +340,7 @@ int load_other_segments(struct kimage *image, return 0; out_err: + image->nr_segments = orig_segments; vfree(dtb); return ret; } -- cgit v1.2.3 From 79605f1394261995c2b955c906a5a20fb27cdc84 Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Thu, 22 Oct 2020 16:30:12 -0400 Subject: riscv: Set text_offset correctly for M-Mode M-Mode Linux is loaded at the start of RAM, not 2MB later. Perhaps this should be calculated based on PAGE_OFFSET somehow? Even better would be to deprecate text_offset and instead introduce something absolute. Signed-off-by: Sean Anderson Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/head.S | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch') diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S index 11e2a4fe66e0..7e849797c9c3 100644 --- a/arch/riscv/kernel/head.S +++ b/arch/riscv/kernel/head.S @@ -35,12 +35,17 @@ ENTRY(_start) .word 0 #endif .balign 8 +#ifdef CONFIG_RISCV_M_MODE + /* Image load offset (0MB) from start of RAM for M-mode */ + .dword 0 +#else #if __riscv_xlen == 64 /* Image load offset(2MB) from start of RAM */ .dword 0x200000 #else /* Image load offset(4MB) from start of RAM */ .dword 0x400000 +#endif #endif /* Effective size of kernel image */ .dword _end - _start -- cgit v1.2.3 From 3fb4a8fa28b740709bdd3229b80279957f4d37ed Mon Sep 17 00:00:00 2001 From: Scott Cheloha Date: Thu, 5 Nov 2020 16:30:40 -0600 Subject: powerpc/numa: Fix build when CONFIG_NUMA=n Add a non-NUMA definition for of_drconf_to_nid_single() to topology.h so we have one even if powerpc/mm/numa.c is not compiled. On a non-NUMA kernel the appropriate node id is always first_online_node. Fixes: 72cdd117c449 ("pseries/hotplug-memory: hot-add: skip redundant LMB lookup") Reported-by: kernel test robot Signed-off-by: Scott Cheloha Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20201105223040.3612663-1-cheloha@linux.ibm.com --- arch/powerpc/include/asm/topology.h | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h index 8728590f514a..3beeb030cd78 100644 --- a/arch/powerpc/include/asm/topology.h +++ b/arch/powerpc/include/asm/topology.h @@ -6,6 +6,7 @@ struct device; struct device_node; +struct drmem_lmb; #ifdef CONFIG_NUMA @@ -61,6 +62,9 @@ static inline int early_cpu_to_node(int cpu) */ return (nid < 0) ? 0 : nid; } + +int of_drconf_to_nid_single(struct drmem_lmb *lmb); + #else static inline int early_cpu_to_node(int cpu) { return 0; } @@ -84,10 +88,12 @@ static inline int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc) return 0; } -#endif /* CONFIG_NUMA */ +static inline int of_drconf_to_nid_single(struct drmem_lmb *lmb) +{ + return first_online_node; +} -struct drmem_lmb; -int of_drconf_to_nid_single(struct drmem_lmb *lmb); +#endif /* CONFIG_NUMA */ #if defined(CONFIG_NUMA) && defined(CONFIG_PPC_SPLPAR) extern int find_and_online_cpu_nid(int cpu); -- cgit v1.2.3 From bcacf5f6f239a9e60287680514f392748cb4ec39 Mon Sep 17 00:00:00 2001 From: Liu Shaohua Date: Mon, 26 Oct 2020 20:26:54 +0800 Subject: riscv: fix pfn_to_virt err in do_page_fault(). The argument to pfn_to_virt() should be pfn not the value of CSR_SATP. Reviewed-by: Palmer Dabbelt Reviewed-by: Anup Patel Signed-off-by: liush Reviewed-by: Pekka Enberg Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/fault.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c index 1359e21c0c62..3c8b9e433c67 100644 --- a/arch/riscv/mm/fault.c +++ b/arch/riscv/mm/fault.c @@ -86,6 +86,7 @@ static inline void vmalloc_fault(struct pt_regs *regs, int code, unsigned long a pmd_t *pmd, *pmd_k; pte_t *pte_k; int index; + unsigned long pfn; /* User mode accesses just cause a SIGSEGV */ if (user_mode(regs)) @@ -100,7 +101,8 @@ static inline void vmalloc_fault(struct pt_regs *regs, int code, unsigned long a * of a task switch. */ index = pgd_index(addr); - pgd = (pgd_t *)pfn_to_virt(csr_read(CSR_SATP)) + index; + pfn = csr_read(CSR_SATP) & SATP_PPN; + pgd = (pgd_t *)pfn_to_virt(pfn) + index; pgd_k = init_mm.pgd + index; if (!pgd_present(*pgd_k)) { -- cgit v1.2.3 From 635e3f3e47f24b2506bc9daf91d70ddf3cd024a9 Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Mon, 2 Nov 2020 15:30:52 +0800 Subject: riscv: uaccess: fix __put_kernel_nofault() The copy_from_kernel_nofault() is broken on riscv because the 'dst' and 'src' are mistakenly reversed in __put_kernel_nofault() macro. copy_to_kernel_nofault: ... 0xffffffe0003159b8 <+30>: sd a4,0(a1) # a1 aka 'src' Fixes: d464118cdc ("riscv: implement __get_kernel_nofault and __put_user_nofault") Signed-off-by: Changbin Du Reviewed-by: Christoph Hellwig Reviewed-by: Anup Patel Tested-by: Anup Patel Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/uaccess.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/riscv/include/asm/uaccess.h b/arch/riscv/include/asm/uaccess.h index c47e6b35c551..824b2c9da75b 100644 --- a/arch/riscv/include/asm/uaccess.h +++ b/arch/riscv/include/asm/uaccess.h @@ -476,7 +476,7 @@ do { \ do { \ long __kr_err; \ \ - __put_user_nocheck(*((type *)(dst)), (type *)(src), __kr_err); \ + __put_user_nocheck(*((type *)(src)), (type *)(dst), __kr_err); \ if (unlikely(__kr_err)) \ goto err_label; \ } while (0) -- cgit v1.2.3 From 1074dd44c5ba377f90e2d0d99a784f73dbea6ff7 Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Wed, 4 Nov 2020 12:07:13 +0530 Subject: RISC-V: Use non-PGD mappings for early DTB access Currently, we use PGD mappings for early DTB mapping in early_pgd but this breaks Linux kernel on SiFive Unleashed because on SiFive Unleashed PMP checks don't work correctly for PGD mappings. To fix early DTB mappings on SiFive Unleashed, we use non-PGD mappings (i.e. PMD) for early DTB access. Fixes: 8f3a2b4a96dc ("RISC-V: Move DT mapping outof fixmap") Signed-off-by: Anup Patel Reviewed-by: Atish Patra Tested-by: Atish Patra Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/init.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'arch') diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 42c14eb6bc43..8e577f14f120 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -293,6 +293,7 @@ pmd_t fixmap_pmd[PTRS_PER_PMD] __page_aligned_bss; #define NUM_EARLY_PMDS (1UL + MAX_EARLY_MAPPING_SIZE / PGDIR_SIZE) #endif pmd_t early_pmd[PTRS_PER_PMD * NUM_EARLY_PMDS] __initdata __aligned(PAGE_SIZE); +pmd_t early_dtb_pmd[PTRS_PER_PMD] __initdata __aligned(PAGE_SIZE); static pmd_t *__init get_pmd_virt_early(phys_addr_t pa) { @@ -490,6 +491,18 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) load_pa + (va - PAGE_OFFSET), map_size, PAGE_KERNEL_EXEC); +#ifndef __PAGETABLE_PMD_FOLDED + /* Setup early PMD for DTB */ + create_pgd_mapping(early_pg_dir, DTB_EARLY_BASE_VA, + (uintptr_t)early_dtb_pmd, PGDIR_SIZE, PAGE_TABLE); + /* Create two consecutive PMD mappings for FDT early scan */ + pa = dtb_pa & ~(PMD_SIZE - 1); + create_pmd_mapping(early_dtb_pmd, DTB_EARLY_BASE_VA, + pa, PMD_SIZE, PAGE_KERNEL); + create_pmd_mapping(early_dtb_pmd, DTB_EARLY_BASE_VA + PMD_SIZE, + pa + PMD_SIZE, PMD_SIZE, PAGE_KERNEL); + dtb_early_va = (void *)DTB_EARLY_BASE_VA + (dtb_pa & (PMD_SIZE - 1)); +#else /* Create two consecutive PGD mappings for FDT early scan */ pa = dtb_pa & ~(PGDIR_SIZE - 1); create_pgd_mapping(early_pg_dir, DTB_EARLY_BASE_VA, @@ -497,6 +510,7 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) create_pgd_mapping(early_pg_dir, DTB_EARLY_BASE_VA + PGDIR_SIZE, pa + PGDIR_SIZE, PGDIR_SIZE, PAGE_KERNEL); dtb_early_va = (void *)DTB_EARLY_BASE_VA + (dtb_pa & (PGDIR_SIZE - 1)); +#endif dtb_early_pa = dtb_pa; /* -- cgit v1.2.3 From c2c81bb2f69138f902e1a58d3bef6ad97fb8a92c Mon Sep 17 00:00:00 2001 From: Palmer Dabbelt Date: Fri, 23 Oct 2020 21:50:47 -0700 Subject: RISC-V: Fix the VDSO symbol generaton for binutils-2.35+ We were relying on GNU ld's ability to re-link executable files in order to extract our VDSO symbols. This behavior was deemed a bug as of binutils-2.35 (specifically the binutils-gdb commit a87e1817a4 ("Have the linker fail if any attempt to link in an executable is made."), but as that has been backported to at least Debian's binutils-2.34 in may manifest in other places. The previous version of this was a bit of a mess: we were linking a static executable version of the VDSO, containing only a subset of the input symbols, which we then linked into the kernel. This worked, but certainly wasn't a supported path through the toolchain. Instead this new version parses the textual output of nm to produce a symbol table. Both rely on near-zero addresses being linkable, but as we rely on weak undefined symbols being linkable elsewhere I don't view this as a major issue. Fixes: e2c0cdfba7f6 ("RISC-V: User-facing API") Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/vdso/.gitignore | 1 + arch/riscv/kernel/vdso/Makefile | 18 +++++++++--------- arch/riscv/kernel/vdso/so2s.sh | 6 ++++++ 3 files changed, 16 insertions(+), 9 deletions(-) create mode 100755 arch/riscv/kernel/vdso/so2s.sh (limited to 'arch') diff --git a/arch/riscv/kernel/vdso/.gitignore b/arch/riscv/kernel/vdso/.gitignore index 11ebee9e4c1d..3a19def868ec 100644 --- a/arch/riscv/kernel/vdso/.gitignore +++ b/arch/riscv/kernel/vdso/.gitignore @@ -1,3 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only vdso.lds *.tmp +vdso-syms.S diff --git a/arch/riscv/kernel/vdso/Makefile b/arch/riscv/kernel/vdso/Makefile index 7d6a94d45ec9..cb8f9e4cfcbf 100644 --- a/arch/riscv/kernel/vdso/Makefile +++ b/arch/riscv/kernel/vdso/Makefile @@ -43,19 +43,14 @@ $(obj)/vdso.o: $(obj)/vdso.so SYSCFLAGS_vdso.so.dbg = $(c_flags) $(obj)/vdso.so.dbg: $(src)/vdso.lds $(obj-vdso) FORCE $(call if_changed,vdsold) +SYSCFLAGS_vdso.so.dbg = -shared -s -Wl,-soname=linux-vdso.so.1 \ + -Wl,--build-id -Wl,--hash-style=both # We also create a special relocatable object that should mirror the symbol # table and layout of the linked DSO. With ld --just-symbols we can then # refer to these symbols in the kernel code rather than hand-coded addresses. - -SYSCFLAGS_vdso.so.dbg = -shared -s -Wl,-soname=linux-vdso.so.1 \ - -Wl,--build-id=sha1 -Wl,--hash-style=both -$(obj)/vdso-dummy.o: $(src)/vdso.lds $(obj)/rt_sigreturn.o FORCE - $(call if_changed,vdsold) - -LDFLAGS_vdso-syms.o := -r --just-symbols -$(obj)/vdso-syms.o: $(obj)/vdso-dummy.o FORCE - $(call if_changed,ld) +$(obj)/vdso-syms.S: $(obj)/vdso.so FORCE + $(call if_changed,so2s) # strip rule for the .so file $(obj)/%.so: OBJCOPYFLAGS := -S @@ -73,6 +68,11 @@ quiet_cmd_vdsold = VDSOLD $@ $(patsubst %, -G __vdso_%, $(vdso-syms)) $@.tmp $@ && \ rm $@.tmp +# Extracts symbol offsets from the VDSO, converting them into an assembly file +# that contains the same symbols at the same offsets. +quiet_cmd_so2s = SO2S $@ + cmd_so2s = $(NM) -D $< | $(srctree)/$(src)/so2s.sh > $@ + # install commands for the unstripped file quiet_cmd_vdso_install = INSTALL $@ cmd_vdso_install = cp $(obj)/$@.dbg $(MODLIB)/vdso/$@ diff --git a/arch/riscv/kernel/vdso/so2s.sh b/arch/riscv/kernel/vdso/so2s.sh new file mode 100755 index 000000000000..e64cb6d9440e --- /dev/null +++ b/arch/riscv/kernel/vdso/so2s.sh @@ -0,0 +1,6 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0+ +# Copyright 2020 Palmer Dabbelt + +sed 's!\([0-9a-f]*\) T \([a-z0-9_]*\)\(@@LINUX_4.15\)*!.global \2\n.set \2,0x\1!' \ +| grep '^\.' -- cgit v1.2.3 From 1aec69ae56be28b5fd3c9daead5f3840c30153c8 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Thu, 5 Nov 2020 16:27:39 -0600 Subject: x86/platform/uv: Fix missing OEM_TABLE_ID Testing shows a problem in that the OEM_TABLE_ID was missing for hubless systems. This is used to determine the APIC type (legacy or extended). Add the OEM_TABLE_ID to the early hubless processing. Fixes: 1e61f5a95f191 ("Add and decode Arch Type in UVsystab") Signed-off-by: Mike Travis Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201105222741.157029-2-mike.travis@hpe.com --- arch/x86/kernel/apic/x2apic_uv_x.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 714233cee0b5..a5794794ea59 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -366,7 +366,7 @@ static int __init early_get_arch_type(void) return ret; } -static int __init uv_set_system_type(char *_oem_id) +static int __init uv_set_system_type(char *_oem_id, char *_oem_table_id) { /* Save OEM_ID passed from ACPI MADT */ uv_stringify(sizeof(oem_id), oem_id, _oem_id); @@ -394,6 +394,9 @@ static int __init uv_set_system_type(char *_oem_id) else uv_hubless_system = 0x9; + /* Copy APIC type */ + uv_stringify(sizeof(oem_table_id), oem_table_id, _oem_table_id); + pr_info("UV: OEM IDs %s/%s, SystemType %d, HUBLESS ID %x\n", oem_id, oem_table_id, uv_system_type, uv_hubless_system); return 0; @@ -456,7 +459,7 @@ static int __init uv_acpi_madt_oem_check(char *_oem_id, char *_oem_table_id) uv_cpu_info->p_uv_hub_info = &uv_hub_info_node0; /* If not UV, return. */ - if (likely(uv_set_system_type(_oem_id) == 0)) + if (uv_set_system_type(_oem_id, _oem_table_id) == 0) return 0; /* Save and Decode OEM Table ID */ -- cgit v1.2.3 From 1aee505e0171fc38fd5ed70c7f0dcbb7398c759f Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Thu, 5 Nov 2020 16:27:40 -0600 Subject: x86/platform/uv: Remove spaces from OEM IDs Testing shows that trailing spaces caused problems with the OEM_ID and the OEM_TABLE_ID. One being that the OEM_ID would not string compare correctly. Another the OEM_ID and OEM_TABLE_ID would be concatenated in the printout. Remove any trailing spaces. Fixes: 1e61f5a95f191 ("Add and decode Arch Type in UVsystab") Signed-off-by: Mike Travis Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201105222741.157029-3-mike.travis@hpe.com --- arch/x86/kernel/apic/x2apic_uv_x.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch') diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index a5794794ea59..0f848d6dddc9 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -290,6 +290,9 @@ static void __init uv_stringify(int len, char *to, char *from) { /* Relies on 'to' being NULL chars so result will be NULL terminated */ strncpy(to, from, len-1); + + /* Trim trailing spaces */ + (void)strim(to); } /* Find UV arch type entry in UVsystab */ -- cgit v1.2.3 From 801284f9737883a2b2639bd494455a72c82fdedf Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Thu, 5 Nov 2020 16:27:41 -0600 Subject: x86/platform/uv: Recognize UV5 hubless system identifier Testing shows a problem in that UV5 hubless systems were not being recognized. Add them to the list of OEM IDs checked. Fixes: 6c7794423a998 ("Add UV5 direct references") Signed-off-by: Mike Travis Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201105222741.157029-4-mike.travis@hpe.com --- arch/x86/kernel/apic/x2apic_uv_x.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 0f848d6dddc9..3115caa7d7d0 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -389,13 +389,20 @@ static int __init uv_set_system_type(char *_oem_id, char *_oem_table_id) /* (Not hubless), not a UV */ return 0; + /* Is UV hubless system */ + uv_hubless_system = 0x01; + + /* UV5 Hubless */ + if (strncmp(uv_archtype, "NSGI5", 5) == 0) + uv_hubless_system |= 0x20; + /* UV4 Hubless: CH */ - if (strncmp(uv_archtype, "NSGI4", 5) == 0) - uv_hubless_system = 0x11; + else if (strncmp(uv_archtype, "NSGI4", 5) == 0) + uv_hubless_system |= 0x10; /* UV3 Hubless: UV300/MC990X w/o hub */ else - uv_hubless_system = 0x9; + uv_hubless_system |= 0x8; /* Copy APIC type */ uv_stringify(sizeof(oem_table_id), oem_table_id, _oem_table_id); -- cgit v1.2.3