From 584ea6564bcaead223840cdafe84e4947ba85509 Mon Sep 17 00:00:00 2001 From: Evan Green Date: Fri, 18 Aug 2023 12:41:35 -0700 Subject: RISC-V: Probe for unaligned access speed Rather than deferring unaligned access speed determinations to a vendor function, let's probe them and find out how fast they are. If we determine that an unaligned word access is faster than N byte accesses, mark the hardware's unaligned access as "fast". Otherwise, we mark accesses as slow. The algorithm itself runs for a fixed amount of jiffies. Within each iteration it attempts to time a single loop, and then keeps only the best (fastest) loop it saw. This algorithm was found to have lower variance from run to run than my first attempt, which counted the total number of iterations that could be done in that fixed amount of jiffies. By taking only the best iteration in the loop, assuming at least one loop wasn't perturbed by an interrupt, we eliminate the effects of interrupts and other "warm up" factors like branch prediction. The only downside is it depends on having an rdtime granular and accurate enough to measure a single copy. If we ever manage to complete a loop in 0 rdtime ticks, we leave the unaligned setting at UNKNOWN. There is a slight change in user-visible behavior here. Previously, all boards except the THead C906 reported misaligned access speed of UNKNOWN. C906 reported FAST. With this change, since we're now measuring misaligned access speed on each hart, all RISC-V systems will have this key set as either FAST or SLOW. Currently, we don't have a way to confidently measure the difference between SLOW and EMULATED, so we label anything not fast as SLOW. This will mislabel some systems that are actually EMULATED as SLOW. When we get support for delegating misaligned access traps to the kernel (as opposed to the firmware quietly handling it), we can explicitly test in Linux to see if unaligned accesses trap. Those systems will start to report EMULATED, though older (today's) systems without that new SBI mechanism will continue to report SLOW. I've updated the documentation for those hwprobe values to reflect this, specifically: SLOW may or may not be emulated by software, and FAST represents means being faster than equivalent byte accesses. The change in documentation is accurate with respect to both the former and current behavior. Signed-off-by: Evan Green Acked-by: Conor Dooley Link: https://lore.kernel.org/r/20230818194136.4084400-2-evan@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/cpufeature.h | 2 + arch/riscv/kernel/Makefile | 1 + arch/riscv/kernel/copy-unaligned.S | 71 ++++++++++++++++++++++++ arch/riscv/kernel/copy-unaligned.h | 13 +++++ arch/riscv/kernel/cpufeature.c | 104 ++++++++++++++++++++++++++++++++++++ arch/riscv/kernel/smpboot.c | 2 + 6 files changed, 193 insertions(+) create mode 100644 arch/riscv/kernel/copy-unaligned.S create mode 100644 arch/riscv/kernel/copy-unaligned.h (limited to 'arch/riscv') diff --git a/arch/riscv/include/asm/cpufeature.h b/arch/riscv/include/asm/cpufeature.h index 23fed53b8815..d0345bd659c9 100644 --- a/arch/riscv/include/asm/cpufeature.h +++ b/arch/riscv/include/asm/cpufeature.h @@ -30,4 +30,6 @@ DECLARE_PER_CPU(long, misaligned_access_speed); /* Per-cpu ISA extensions. */ extern struct riscv_isainfo hart_isa[NR_CPUS]; +void check_unaligned_access(int cpu); + #endif diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile index 506cc4a9a45a..7e6c464cdfe9 100644 --- a/arch/riscv/kernel/Makefile +++ b/arch/riscv/kernel/Makefile @@ -38,6 +38,7 @@ extra-y += vmlinux.lds obj-y += head.o obj-y += soc.o obj-$(CONFIG_RISCV_ALTERNATIVE) += alternative.o +obj-y += copy-unaligned.o obj-y += cpu.o obj-y += cpufeature.o obj-y += entry.o diff --git a/arch/riscv/kernel/copy-unaligned.S b/arch/riscv/kernel/copy-unaligned.S new file mode 100644 index 000000000000..cfdecfbaad62 --- /dev/null +++ b/arch/riscv/kernel/copy-unaligned.S @@ -0,0 +1,71 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2023 Rivos Inc. */ + +#include +#include + + .text + +/* void __riscv_copy_words_unaligned(void *, const void *, size_t) */ +/* Performs a memcpy without aligning buffers, using word loads and stores. */ +/* Note: The size is truncated to a multiple of 8 * SZREG */ +ENTRY(__riscv_copy_words_unaligned) + andi a4, a2, ~((8*SZREG)-1) + beqz a4, 2f + add a3, a1, a4 +1: + REG_L a4, 0(a1) + REG_L a5, SZREG(a1) + REG_L a6, 2*SZREG(a1) + REG_L a7, 3*SZREG(a1) + REG_L t0, 4*SZREG(a1) + REG_L t1, 5*SZREG(a1) + REG_L t2, 6*SZREG(a1) + REG_L t3, 7*SZREG(a1) + REG_S a4, 0(a0) + REG_S a5, SZREG(a0) + REG_S a6, 2*SZREG(a0) + REG_S a7, 3*SZREG(a0) + REG_S t0, 4*SZREG(a0) + REG_S t1, 5*SZREG(a0) + REG_S t2, 6*SZREG(a0) + REG_S t3, 7*SZREG(a0) + addi a0, a0, 8*SZREG + addi a1, a1, 8*SZREG + bltu a1, a3, 1b + +2: + ret +END(__riscv_copy_words_unaligned) + +/* void __riscv_copy_bytes_unaligned(void *, const void *, size_t) */ +/* Performs a memcpy without aligning buffers, using only byte accesses. */ +/* Note: The size is truncated to a multiple of 8 */ +ENTRY(__riscv_copy_bytes_unaligned) + andi a4, a2, ~(8-1) + beqz a4, 2f + add a3, a1, a4 +1: + lb a4, 0(a1) + lb a5, 1(a1) + lb a6, 2(a1) + lb a7, 3(a1) + lb t0, 4(a1) + lb t1, 5(a1) + lb t2, 6(a1) + lb t3, 7(a1) + sb a4, 0(a0) + sb a5, 1(a0) + sb a6, 2(a0) + sb a7, 3(a0) + sb t0, 4(a0) + sb t1, 5(a0) + sb t2, 6(a0) + sb t3, 7(a0) + addi a0, a0, 8 + addi a1, a1, 8 + bltu a1, a3, 1b + +2: + ret +END(__riscv_copy_bytes_unaligned) diff --git a/arch/riscv/kernel/copy-unaligned.h b/arch/riscv/kernel/copy-unaligned.h new file mode 100644 index 000000000000..e3d70d35b708 --- /dev/null +++ b/arch/riscv/kernel/copy-unaligned.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2023 Rivos, Inc. + */ +#ifndef __RISCV_KERNEL_COPY_UNALIGNED_H +#define __RISCV_KERNEL_COPY_UNALIGNED_H + +#include + +void __riscv_copy_words_unaligned(void *dst, const void *src, size_t size); +void __riscv_copy_bytes_unaligned(void *dst, const void *src, size_t size); + +#endif /* __RISCV_KERNEL_COPY_UNALIGNED_H */ diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c index bdcf460ea53d..cc09444352b2 100644 --- a/arch/riscv/kernel/cpufeature.c +++ b/arch/riscv/kernel/cpufeature.c @@ -19,12 +19,19 @@ #include #include #include +#include #include #include #include +#include "copy-unaligned.h" + #define NUM_ALPHA_EXTS ('z' - 'a' + 1) +#define MISALIGNED_ACCESS_JIFFIES_LG2 1 +#define MISALIGNED_BUFFER_SIZE 0x4000 +#define MISALIGNED_COPY_SIZE ((MISALIGNED_BUFFER_SIZE / 2) - 0x80) + unsigned long elf_hwcap __read_mostly; /* Host ISA bitmap */ @@ -396,6 +403,103 @@ unsigned long riscv_get_elf_hwcap(void) return hwcap; } +void check_unaligned_access(int cpu) +{ + u64 start_cycles, end_cycles; + u64 word_cycles; + u64 byte_cycles; + int ratio; + unsigned long start_jiffies, now; + struct page *page; + void *dst; + void *src; + long speed = RISCV_HWPROBE_MISALIGNED_SLOW; + + page = alloc_pages(GFP_NOWAIT, get_order(MISALIGNED_BUFFER_SIZE)); + if (!page) { + pr_warn("Can't alloc pages to measure memcpy performance"); + return; + } + + /* Make an unaligned destination buffer. */ + dst = (void *)((unsigned long)page_address(page) | 0x1); + /* Unalign src as well, but differently (off by 1 + 2 = 3). */ + src = dst + (MISALIGNED_BUFFER_SIZE / 2); + src += 2; + word_cycles = -1ULL; + /* Do a warmup. */ + __riscv_copy_words_unaligned(dst, src, MISALIGNED_COPY_SIZE); + preempt_disable(); + start_jiffies = jiffies; + while ((now = jiffies) == start_jiffies) + cpu_relax(); + + /* + * For a fixed amount of time, repeatedly try the function, and take + * the best time in cycles as the measurement. + */ + while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) { + start_cycles = get_cycles64(); + /* Ensure the CSR read can't reorder WRT to the copy. */ + mb(); + __riscv_copy_words_unaligned(dst, src, MISALIGNED_COPY_SIZE); + /* Ensure the copy ends before the end time is snapped. */ + mb(); + end_cycles = get_cycles64(); + if ((end_cycles - start_cycles) < word_cycles) + word_cycles = end_cycles - start_cycles; + } + + byte_cycles = -1ULL; + __riscv_copy_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE); + start_jiffies = jiffies; + while ((now = jiffies) == start_jiffies) + cpu_relax(); + + while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) { + start_cycles = get_cycles64(); + mb(); + __riscv_copy_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE); + mb(); + end_cycles = get_cycles64(); + if ((end_cycles - start_cycles) < byte_cycles) + byte_cycles = end_cycles - start_cycles; + } + + preempt_enable(); + + /* Don't divide by zero. */ + if (!word_cycles || !byte_cycles) { + pr_warn("cpu%d: rdtime lacks granularity needed to measure unaligned access speed\n", + cpu); + + goto out; + } + + if (word_cycles < byte_cycles) + speed = RISCV_HWPROBE_MISALIGNED_FAST; + + ratio = div_u64((byte_cycles * 100), word_cycles); + pr_info("cpu%d: Ratio of byte access time to unaligned word access is %d.%02d, unaligned accesses are %s\n", + cpu, + ratio / 100, + ratio % 100, + (speed == RISCV_HWPROBE_MISALIGNED_FAST) ? "fast" : "slow"); + + per_cpu(misaligned_access_speed, cpu) = speed; + +out: + __free_pages(page, get_order(MISALIGNED_BUFFER_SIZE)); +} + +static int check_unaligned_access_boot_cpu(void) +{ + check_unaligned_access(0); + return 0; +} + +arch_initcall(check_unaligned_access_boot_cpu); + #ifdef CONFIG_RISCV_ALTERNATIVE /* * Alternative patch sites consider 48 bits when determining when to patch diff --git a/arch/riscv/kernel/smpboot.c b/arch/riscv/kernel/smpboot.c index f4d6acb38dd0..00ddbd2364dc 100644 --- a/arch/riscv/kernel/smpboot.c +++ b/arch/riscv/kernel/smpboot.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -245,6 +246,7 @@ asmlinkage __visible void smp_callin(void) numa_add_cpu(curr_cpuid); set_cpu_online(curr_cpuid, 1); + check_unaligned_access(curr_cpuid); probe_vendor_features(curr_cpuid); if (has_vector()) { -- cgit v1.2.3 From f2d14bc4e437b8ed21e6890ae047a6ec47c030d9 Mon Sep 17 00:00:00 2001 From: Evan Green Date: Fri, 18 Aug 2023 12:41:36 -0700 Subject: RISC-V: alternative: Remove feature_probe_func Now that we're testing unaligned memory copy and making that determination generically, there are no more users of the vendor feature_probe_func(). While I think it's probably going to need to come back, there are no users right now, so let's remove it until it's needed. Signed-off-by: Evan Green Reviewed-by: Conor Dooley Link: https://lore.kernel.org/r/20230818194136.4084400-3-evan@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/errata/thead/errata.c | 8 -------- arch/riscv/include/asm/alternative.h | 5 ----- arch/riscv/kernel/alternative.c | 19 ------------------- arch/riscv/kernel/smpboot.c | 1 - 4 files changed, 33 deletions(-) (limited to 'arch/riscv') diff --git a/arch/riscv/errata/thead/errata.c b/arch/riscv/errata/thead/errata.c index be84b14f0118..0554ed4bf087 100644 --- a/arch/riscv/errata/thead/errata.c +++ b/arch/riscv/errata/thead/errata.c @@ -120,11 +120,3 @@ void thead_errata_patch_func(struct alt_entry *begin, struct alt_entry *end, if (stage == RISCV_ALTERNATIVES_EARLY_BOOT) local_flush_icache_all(); } - -void thead_feature_probe_func(unsigned int cpu, - unsigned long archid, - unsigned long impid) -{ - if ((archid == 0) && (impid == 0)) - per_cpu(misaligned_access_speed, cpu) = RISCV_HWPROBE_MISALIGNED_FAST; -} diff --git a/arch/riscv/include/asm/alternative.h b/arch/riscv/include/asm/alternative.h index 6a41537826a7..58ccd2f8cab7 100644 --- a/arch/riscv/include/asm/alternative.h +++ b/arch/riscv/include/asm/alternative.h @@ -30,7 +30,6 @@ #define ALT_OLD_PTR(a) __ALT_PTR(a, old_offset) #define ALT_ALT_PTR(a) __ALT_PTR(a, alt_offset) -void probe_vendor_features(unsigned int cpu); void __init apply_boot_alternatives(void); void __init apply_early_boot_alternatives(void); void apply_module_alternatives(void *start, size_t length); @@ -53,15 +52,11 @@ void thead_errata_patch_func(struct alt_entry *begin, struct alt_entry *end, unsigned long archid, unsigned long impid, unsigned int stage); -void thead_feature_probe_func(unsigned int cpu, unsigned long archid, - unsigned long impid); - void riscv_cpufeature_patch_func(struct alt_entry *begin, struct alt_entry *end, unsigned int stage); #else /* CONFIG_RISCV_ALTERNATIVE */ -static inline void probe_vendor_features(unsigned int cpu) { } static inline void apply_boot_alternatives(void) { } static inline void apply_early_boot_alternatives(void) { } static inline void apply_module_alternatives(void *start, size_t length) { } diff --git a/arch/riscv/kernel/alternative.c b/arch/riscv/kernel/alternative.c index 6b75788c18e6..85056153fa23 100644 --- a/arch/riscv/kernel/alternative.c +++ b/arch/riscv/kernel/alternative.c @@ -27,8 +27,6 @@ struct cpu_manufacturer_info_t { void (*patch_func)(struct alt_entry *begin, struct alt_entry *end, unsigned long archid, unsigned long impid, unsigned int stage); - void (*feature_probe_func)(unsigned int cpu, unsigned long archid, - unsigned long impid); }; static void riscv_fill_cpu_mfr_info(struct cpu_manufacturer_info_t *cpu_mfr_info) @@ -43,7 +41,6 @@ static void riscv_fill_cpu_mfr_info(struct cpu_manufacturer_info_t *cpu_mfr_info cpu_mfr_info->imp_id = sbi_get_mimpid(); #endif - cpu_mfr_info->feature_probe_func = NULL; switch (cpu_mfr_info->vendor_id) { #ifdef CONFIG_ERRATA_SIFIVE case SIFIVE_VENDOR_ID: @@ -53,7 +50,6 @@ static void riscv_fill_cpu_mfr_info(struct cpu_manufacturer_info_t *cpu_mfr_info #ifdef CONFIG_ERRATA_THEAD case THEAD_VENDOR_ID: cpu_mfr_info->patch_func = thead_errata_patch_func; - cpu_mfr_info->feature_probe_func = thead_feature_probe_func; break; #endif default: @@ -143,20 +139,6 @@ void riscv_alternative_fix_offsets(void *alt_ptr, unsigned int len, } } -/* Called on each CPU as it starts */ -void probe_vendor_features(unsigned int cpu) -{ - struct cpu_manufacturer_info_t cpu_mfr_info; - - riscv_fill_cpu_mfr_info(&cpu_mfr_info); - if (!cpu_mfr_info.feature_probe_func) - return; - - cpu_mfr_info.feature_probe_func(cpu, - cpu_mfr_info.arch_id, - cpu_mfr_info.imp_id); -} - /* * This is called very early in the boot process (directly after we run * a feature detect on the boot CPU). No need to worry about other CPUs @@ -211,7 +193,6 @@ void __init apply_boot_alternatives(void) /* If called on non-boot cpu things could go wrong */ WARN_ON(smp_processor_id() != 0); - probe_vendor_features(0); _apply_alternatives((struct alt_entry *)__alt_start, (struct alt_entry *)__alt_end, RISCV_ALTERNATIVES_BOOT); diff --git a/arch/riscv/kernel/smpboot.c b/arch/riscv/kernel/smpboot.c index 00ddbd2364dc..1b8da4e40a4d 100644 --- a/arch/riscv/kernel/smpboot.c +++ b/arch/riscv/kernel/smpboot.c @@ -247,7 +247,6 @@ asmlinkage __visible void smp_callin(void) numa_add_cpu(curr_cpuid); set_cpu_online(curr_cpuid, 1); check_unaligned_access(curr_cpuid); - probe_vendor_features(curr_cpuid); if (has_vector()) { if (riscv_v_setup_vsize()) -- cgit v1.2.3 From eb746180132a555da8cfb02d98cb6d0a9d58e870 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 17 Aug 2023 00:23:34 +0100 Subject: riscv: dma-mapping: only invalidate after DMA, not flush No other architecture intentionally writes back dirty cache lines into a buffer that a device has just finished writing into. If the cache is clean, this has no effect at all, but if a cacheline in the buffer has actually been written by the CPU, there is a driver bug that is likely made worse by overwriting that buffer. Signed-off-by: Arnd Bergmann Reviewed-by: Conor Dooley Reviewed-by: Lad Prabhakar Acked-by: Palmer Dabbelt Signed-off-by: Lad Prabhakar Link: https://lore.kernel.org/r/20230816232336.164413-2-prabhakar.mahadev-lad.rj@bp.renesas.com Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/dma-noncoherent.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/riscv') diff --git a/arch/riscv/mm/dma-noncoherent.c b/arch/riscv/mm/dma-noncoherent.c index d51a75864e53..94614cf61cdd 100644 --- a/arch/riscv/mm/dma-noncoherent.c +++ b/arch/riscv/mm/dma-noncoherent.c @@ -42,7 +42,7 @@ void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, break; case DMA_FROM_DEVICE: case DMA_BIDIRECTIONAL: - ALT_CMO_OP(flush, vaddr, size, riscv_cbom_block_size); + ALT_CMO_OP(inval, vaddr, size, riscv_cbom_block_size); break; default: break; -- cgit v1.2.3 From 482069ebdc1d61fba14527055894b9f4f0ced08c Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 17 Aug 2023 00:23:35 +0100 Subject: riscv: dma-mapping: skip invalidation before bidirectional DMA For a DMA_BIDIRECTIONAL transfer, the caches have to be cleaned first to let the device see data written by the CPU, and invalidated after the transfer to let the CPU see data written by the device. riscv also invalidates the caches before the transfer, which does not appear to serve any purpose. Signed-off-by: Arnd Bergmann Reviewed-by: Conor Dooley Reviewed-by: Lad Prabhakar Acked-by: Palmer Dabbelt Acked-by: Guo Ren Signed-off-by: Lad Prabhakar Link: https://lore.kernel.org/r/20230816232336.164413-3-prabhakar.mahadev-lad.rj@bp.renesas.com Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/dma-noncoherent.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/riscv') diff --git a/arch/riscv/mm/dma-noncoherent.c b/arch/riscv/mm/dma-noncoherent.c index 94614cf61cdd..fc6377a64c8d 100644 --- a/arch/riscv/mm/dma-noncoherent.c +++ b/arch/riscv/mm/dma-noncoherent.c @@ -25,7 +25,7 @@ void arch_sync_dma_for_device(phys_addr_t paddr, size_t size, ALT_CMO_OP(clean, vaddr, size, riscv_cbom_block_size); break; case DMA_BIDIRECTIONAL: - ALT_CMO_OP(flush, vaddr, size, riscv_cbom_block_size); + ALT_CMO_OP(clean, vaddr, size, riscv_cbom_block_size); break; default: break; -- cgit v1.2.3 From 935730160738a478dbf4b61cf0cfc29c1442fb4e Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Thu, 17 Aug 2023 00:23:36 +0100 Subject: riscv: dma-mapping: switch over to generic implementation Add helper functions for cache wback/inval/clean and use them arch_sync_dma_for_device()/arch_sync_dma_for_cpu() functions. The proposed changes are in preparation for switching over to generic implementation. Reorganization of the code is based on the patch (Link[0]) from Arnd. For now I have dropped CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU check as this will be enabled by default upon selection of RISCV_DMA_NONCOHERENT and also dropped arch_dma_mark_dcache_clean(). Link[0]: https://lore.kernel.org/all/20230327121317.4081816-22-arnd@kernel.org/ Signed-off-by: Arnd Bergmann Signed-off-by: Lad Prabhakar Link: https://lore.kernel.org/r/20230816232336.164413-4-prabhakar.mahadev-lad.rj@bp.renesas.com Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/dma-noncoherent.c | 60 ++++++++++++++++++++++++++++++++++------- 1 file changed, 51 insertions(+), 9 deletions(-) (limited to 'arch/riscv') diff --git a/arch/riscv/mm/dma-noncoherent.c b/arch/riscv/mm/dma-noncoherent.c index fc6377a64c8d..06b8fea58e20 100644 --- a/arch/riscv/mm/dma-noncoherent.c +++ b/arch/riscv/mm/dma-noncoherent.c @@ -12,21 +12,61 @@ static bool noncoherent_supported __ro_after_init; -void arch_sync_dma_for_device(phys_addr_t paddr, size_t size, - enum dma_data_direction dir) +static inline void arch_dma_cache_wback(phys_addr_t paddr, size_t size) +{ + void *vaddr = phys_to_virt(paddr); + + ALT_CMO_OP(clean, vaddr, size, riscv_cbom_block_size); +} + +static inline void arch_dma_cache_inv(phys_addr_t paddr, size_t size) +{ + void *vaddr = phys_to_virt(paddr); + + ALT_CMO_OP(inval, vaddr, size, riscv_cbom_block_size); +} + +static inline void arch_dma_cache_wback_inv(phys_addr_t paddr, size_t size) { void *vaddr = phys_to_virt(paddr); + ALT_CMO_OP(flush, vaddr, size, riscv_cbom_block_size); +} + +static inline bool arch_sync_dma_clean_before_fromdevice(void) +{ + return true; +} + +static inline bool arch_sync_dma_cpu_needs_post_dma_flush(void) +{ + return true; +} + +void arch_sync_dma_for_device(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) +{ switch (dir) { case DMA_TO_DEVICE: - ALT_CMO_OP(clean, vaddr, size, riscv_cbom_block_size); + arch_dma_cache_wback(paddr, size); break; + case DMA_FROM_DEVICE: - ALT_CMO_OP(clean, vaddr, size, riscv_cbom_block_size); - break; + if (!arch_sync_dma_clean_before_fromdevice()) { + arch_dma_cache_inv(paddr, size); + break; + } + fallthrough; + case DMA_BIDIRECTIONAL: - ALT_CMO_OP(clean, vaddr, size, riscv_cbom_block_size); + /* Skip the invalidate here if it's done later */ + if (IS_ENABLED(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) && + arch_sync_dma_cpu_needs_post_dma_flush()) + arch_dma_cache_wback(paddr, size); + else + arch_dma_cache_wback_inv(paddr, size); break; + default: break; } @@ -35,15 +75,17 @@ void arch_sync_dma_for_device(phys_addr_t paddr, size_t size, void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, enum dma_data_direction dir) { - void *vaddr = phys_to_virt(paddr); - switch (dir) { case DMA_TO_DEVICE: break; + case DMA_FROM_DEVICE: case DMA_BIDIRECTIONAL: - ALT_CMO_OP(inval, vaddr, size, riscv_cbom_block_size); + /* FROM_DEVICE invalidate needed if speculative CPU prefetch only */ + if (arch_sync_dma_cpu_needs_post_dma_flush()) + arch_dma_cache_inv(paddr, size); break; + default: break; } -- cgit v1.2.3 From d6ca3a56f4f3e1d408397f1e309f7c57a6d01c38 Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Fri, 18 Aug 2023 14:57:18 +0100 Subject: riscv: asm: vendorid_list: Add Andes Technology to the vendors list Add Andes Technology to the vendors list. Signed-off-by: Lad Prabhakar Reviewed-by: Heiko Stuebner Reviewed-by: Conor Dooley Reviewed-by: Geert Uytterhoeven Tested-by: Conor Dooley # tyre-kicking on a d1 Link: https://lore.kernel.org/r/20230818135723.80612-2-prabhakar.mahadev-lad.rj@bp.renesas.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/vendorid_list.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/riscv') diff --git a/arch/riscv/include/asm/vendorid_list.h b/arch/riscv/include/asm/vendorid_list.h index cb89af3f0704..e55407ace0c3 100644 --- a/arch/riscv/include/asm/vendorid_list.h +++ b/arch/riscv/include/asm/vendorid_list.h @@ -5,6 +5,7 @@ #ifndef ASM_VENDOR_LIST_H #define ASM_VENDOR_LIST_H +#define ANDESTECH_VENDOR_ID 0x31e #define SIFIVE_VENDOR_ID 0x489 #define THEAD_VENDOR_ID 0x5b7 -- cgit v1.2.3 From e021ae7f5145d46ab64cb058cbffda31059f37e5 Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Fri, 18 Aug 2023 14:57:19 +0100 Subject: riscv: errata: Add Andes alternative ports Add required ports of the Alternative scheme for Andes CPU cores. I/O Coherence Port (IOCP) provides an AXI interface for connecting external non-caching masters, such as DMA controllers. IOCP is a specification option and is disabled on the Renesas RZ/Five SoC due to this reason cache management needs a software workaround. Signed-off-by: Lad Prabhakar Reviewed-by: Conor Dooley Tested-by: Conor Dooley # tyre-kicking on a d1 Link: https://lore.kernel.org/r/20230818135723.80612-3-prabhakar.mahadev-lad.rj@bp.renesas.com Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig.errata | 21 ++++++++++++ arch/riscv/errata/Makefile | 1 + arch/riscv/errata/andes/Makefile | 1 + arch/riscv/errata/andes/errata.c | 66 ++++++++++++++++++++++++++++++++++++ arch/riscv/include/asm/alternative.h | 3 ++ arch/riscv/include/asm/errata_list.h | 5 +++ arch/riscv/kernel/alternative.c | 5 +++ 7 files changed, 102 insertions(+) create mode 100644 arch/riscv/errata/andes/Makefile create mode 100644 arch/riscv/errata/andes/errata.c (limited to 'arch/riscv') diff --git a/arch/riscv/Kconfig.errata b/arch/riscv/Kconfig.errata index 0c8f4652cd82..92c779764b27 100644 --- a/arch/riscv/Kconfig.errata +++ b/arch/riscv/Kconfig.errata @@ -1,5 +1,26 @@ menu "CPU errata selection" +config ERRATA_ANDES + bool "Andes AX45MP errata" + depends on RISCV_ALTERNATIVE + help + All Andes errata Kconfig depend on this Kconfig. Disabling + this Kconfig will disable all Andes errata. Please say "Y" + here if your platform uses Andes CPU cores. + + Otherwise, please say "N" here to avoid unnecessary overhead. + +config ERRATA_ANDES_CMO + bool "Apply Andes cache management errata" + depends on ERRATA_ANDES && MMU && ARCH_R9A07G043 + select RISCV_DMA_NONCOHERENT + default y + help + This will apply the cache management errata to handle the + non-standard handling on non-coherent operations on Andes cores. + + If you don't know what to do here, say "Y". + config ERRATA_SIFIVE bool "SiFive errata" depends on RISCV_ALTERNATIVE diff --git a/arch/riscv/errata/Makefile b/arch/riscv/errata/Makefile index 7b2637c8c332..8a2739485123 100644 --- a/arch/riscv/errata/Makefile +++ b/arch/riscv/errata/Makefile @@ -2,5 +2,6 @@ ifdef CONFIG_RELOCATABLE KBUILD_CFLAGS += -fno-pie endif +obj-$(CONFIG_ERRATA_ANDES) += andes/ obj-$(CONFIG_ERRATA_SIFIVE) += sifive/ obj-$(CONFIG_ERRATA_THEAD) += thead/ diff --git a/arch/riscv/errata/andes/Makefile b/arch/riscv/errata/andes/Makefile new file mode 100644 index 000000000000..2d644e19caef --- /dev/null +++ b/arch/riscv/errata/andes/Makefile @@ -0,0 +1 @@ +obj-y += errata.o diff --git a/arch/riscv/errata/andes/errata.c b/arch/riscv/errata/andes/errata.c new file mode 100644 index 000000000000..197db68cc8da --- /dev/null +++ b/arch/riscv/errata/andes/errata.c @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Erratas to be applied for Andes CPU cores + * + * Copyright (C) 2023 Renesas Electronics Corporation. + * + * Author: Lad Prabhakar + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#define ANDESTECH_AX45MP_MARCHID 0x8000000000008a45UL +#define ANDESTECH_AX45MP_MIMPID 0x500UL +#define ANDESTECH_SBI_EXT_ANDES 0x0900031E + +#define ANDES_SBI_EXT_IOCP_SW_WORKAROUND 1 + +static long ax45mp_iocp_sw_workaround(void) +{ + struct sbiret ret; + + /* + * ANDES_SBI_EXT_IOCP_SW_WORKAROUND SBI EXT checks if the IOCP is missing and + * cache is controllable only then CMO will be applied to the platform. + */ + ret = sbi_ecall(ANDESTECH_SBI_EXT_ANDES, ANDES_SBI_EXT_IOCP_SW_WORKAROUND, + 0, 0, 0, 0, 0, 0); + + return ret.error ? 0 : ret.value; +} + +static bool errata_probe_iocp(unsigned int stage, unsigned long arch_id, unsigned long impid) +{ + if (!IS_ENABLED(CONFIG_ERRATA_ANDES_CMO)) + return false; + + if (arch_id != ANDESTECH_AX45MP_MARCHID || impid != ANDESTECH_AX45MP_MIMPID) + return false; + + if (!ax45mp_iocp_sw_workaround()) + return false; + + /* Set this just to make core cbo code happy */ + riscv_cbom_block_size = 1; + riscv_noncoherent_supported(); + + return true; +} + +void __init_or_module andes_errata_patch_func(struct alt_entry *begin, struct alt_entry *end, + unsigned long archid, unsigned long impid, + unsigned int stage) +{ + errata_probe_iocp(stage, archid, impid); + + /* we have nothing to patch here ATM so just return back */ +} diff --git a/arch/riscv/include/asm/alternative.h b/arch/riscv/include/asm/alternative.h index 6a41537826a7..f6cfca939c92 100644 --- a/arch/riscv/include/asm/alternative.h +++ b/arch/riscv/include/asm/alternative.h @@ -46,6 +46,9 @@ struct alt_entry { u32 patch_id; /* The patch ID (erratum ID or cpufeature ID) */ }; +void andes_errata_patch_func(struct alt_entry *begin, struct alt_entry *end, + unsigned long archid, unsigned long impid, + unsigned int stage); void sifive_errata_patch_func(struct alt_entry *begin, struct alt_entry *end, unsigned long archid, unsigned long impid, unsigned int stage); diff --git a/arch/riscv/include/asm/errata_list.h b/arch/riscv/include/asm/errata_list.h index fb1a810f3d8c..e2ecd01bfac7 100644 --- a/arch/riscv/include/asm/errata_list.h +++ b/arch/riscv/include/asm/errata_list.h @@ -11,6 +11,11 @@ #include #include +#ifdef CONFIG_ERRATA_ANDES +#define ERRATA_ANDESTECH_NO_IOCP 0 +#define ERRATA_ANDESTECH_NUMBER 1 +#endif + #ifdef CONFIG_ERRATA_SIFIVE #define ERRATA_SIFIVE_CIP_453 0 #define ERRATA_SIFIVE_CIP_1200 1 diff --git a/arch/riscv/kernel/alternative.c b/arch/riscv/kernel/alternative.c index 6b75788c18e6..b0345992a35e 100644 --- a/arch/riscv/kernel/alternative.c +++ b/arch/riscv/kernel/alternative.c @@ -45,6 +45,11 @@ static void riscv_fill_cpu_mfr_info(struct cpu_manufacturer_info_t *cpu_mfr_info cpu_mfr_info->feature_probe_func = NULL; switch (cpu_mfr_info->vendor_id) { +#ifdef CONFIG_ERRATA_ANDES + case ANDESTECH_VENDOR_ID: + cpu_mfr_info->patch_func = andes_errata_patch_func; + break; +#endif #ifdef CONFIG_ERRATA_SIFIVE case SIFIVE_VENDOR_ID: cpu_mfr_info->patch_func = sifive_errata_patch_func; -- cgit v1.2.3 From b79f300c1fd4bd83b1f827c7a0e043fca7aad73c Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Fri, 18 Aug 2023 14:57:20 +0100 Subject: riscv: mm: dma-noncoherent: nonstandard cache operations support Introduce support for nonstandard noncoherent systems in the RISC-V architecture. It enables function pointer support to handle cache management in such systems. This patch adds a new configuration option called "RISCV_NONSTANDARD_CACHE_OPS." This option is a boolean flag that depends on "RISCV_DMA_NONCOHERENT" and enables the function pointer support for cache management in nonstandard noncoherent systems. Signed-off-by: Lad Prabhakar Reviewed-by: Conor Dooley Tested-by: Conor Dooley # tyre-kicking on a d1 Reviewed-by: Emil Renner Berthing Tested-by: Emil Renner Berthing # Link: https://lore.kernel.org/r/20230818135723.80612-4-prabhakar.mahadev-lad.rj@bp.renesas.com Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 7 ++++++ arch/riscv/include/asm/dma-noncoherent.h | 28 +++++++++++++++++++++ arch/riscv/mm/dma-noncoherent.c | 43 ++++++++++++++++++++++++++++++++ arch/riscv/mm/pmem.c | 13 ++++++++++ 4 files changed, 91 insertions(+) create mode 100644 arch/riscv/include/asm/dma-noncoherent.h (limited to 'arch/riscv') diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 4c07b9189c86..3c2087974ce0 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -269,6 +269,13 @@ config RISCV_DMA_NONCOHERENT select ARCH_HAS_SYNC_DMA_FOR_DEVICE select DMA_DIRECT_REMAP +config RISCV_NONSTANDARD_CACHE_OPS + bool + depends on RISCV_DMA_NONCOHERENT + help + This enables function pointer support for non-standard noncoherent + systems to handle cache management. + config AS_HAS_INSN def_bool $(as-instr,.insn r 51$(comma) 0$(comma) 0$(comma) t0$(comma) t0$(comma) zero) diff --git a/arch/riscv/include/asm/dma-noncoherent.h b/arch/riscv/include/asm/dma-noncoherent.h new file mode 100644 index 000000000000..312cfa0858fb --- /dev/null +++ b/arch/riscv/include/asm/dma-noncoherent.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2023 Renesas Electronics Corp. + */ + +#ifndef __ASM_DMA_NONCOHERENT_H +#define __ASM_DMA_NONCOHERENT_H + +#include + +/* + * struct riscv_nonstd_cache_ops - Structure for non-standard CMO function pointers + * + * @wback: Function pointer for cache writeback + * @inv: Function pointer for invalidating cache + * @wback_inv: Function pointer for flushing the cache (writeback + invalidating) + */ +struct riscv_nonstd_cache_ops { + void (*wback)(phys_addr_t paddr, size_t size); + void (*inv)(phys_addr_t paddr, size_t size); + void (*wback_inv)(phys_addr_t paddr, size_t size); +}; + +extern struct riscv_nonstd_cache_ops noncoherent_cache_ops; + +void riscv_noncoherent_register_cache_ops(const struct riscv_nonstd_cache_ops *ops); + +#endif /* __ASM_DMA_NONCOHERENT_H */ diff --git a/arch/riscv/mm/dma-noncoherent.c b/arch/riscv/mm/dma-noncoherent.c index 06b8fea58e20..a4f3f37859ae 100644 --- a/arch/riscv/mm/dma-noncoherent.c +++ b/arch/riscv/mm/dma-noncoherent.c @@ -9,13 +9,26 @@ #include #include #include +#include static bool noncoherent_supported __ro_after_init; +struct riscv_nonstd_cache_ops noncoherent_cache_ops __ro_after_init = { + .wback = NULL, + .inv = NULL, + .wback_inv = NULL, +}; + static inline void arch_dma_cache_wback(phys_addr_t paddr, size_t size) { void *vaddr = phys_to_virt(paddr); +#ifdef CONFIG_RISCV_NONSTANDARD_CACHE_OPS + if (unlikely(noncoherent_cache_ops.wback)) { + noncoherent_cache_ops.wback(paddr, size); + return; + } +#endif ALT_CMO_OP(clean, vaddr, size, riscv_cbom_block_size); } @@ -23,6 +36,13 @@ static inline void arch_dma_cache_inv(phys_addr_t paddr, size_t size) { void *vaddr = phys_to_virt(paddr); +#ifdef CONFIG_RISCV_NONSTANDARD_CACHE_OPS + if (unlikely(noncoherent_cache_ops.inv)) { + noncoherent_cache_ops.inv(paddr, size); + return; + } +#endif + ALT_CMO_OP(inval, vaddr, size, riscv_cbom_block_size); } @@ -30,6 +50,13 @@ static inline void arch_dma_cache_wback_inv(phys_addr_t paddr, size_t size) { void *vaddr = phys_to_virt(paddr); +#ifdef CONFIG_RISCV_NONSTANDARD_CACHE_OPS + if (unlikely(noncoherent_cache_ops.wback_inv)) { + noncoherent_cache_ops.wback_inv(paddr, size); + return; + } +#endif + ALT_CMO_OP(flush, vaddr, size, riscv_cbom_block_size); } @@ -95,6 +122,13 @@ void arch_dma_prep_coherent(struct page *page, size_t size) { void *flush_addr = page_address(page); +#ifdef CONFIG_RISCV_NONSTANDARD_CACHE_OPS + if (unlikely(noncoherent_cache_ops.wback_inv)) { + noncoherent_cache_ops.wback_inv(page_to_phys(page), size); + return; + } +#endif + ALT_CMO_OP(flush, flush_addr, size, riscv_cbom_block_size); } @@ -120,3 +154,12 @@ void riscv_noncoherent_supported(void) "Non-coherent DMA support enabled without a block size\n"); noncoherent_supported = true; } + +void riscv_noncoherent_register_cache_ops(const struct riscv_nonstd_cache_ops *ops) +{ + if (!ops) + return; + + noncoherent_cache_ops = *ops; +} +EXPORT_SYMBOL_GPL(riscv_noncoherent_register_cache_ops); diff --git a/arch/riscv/mm/pmem.c b/arch/riscv/mm/pmem.c index 089df92ae876..c5fc5ec96f6d 100644 --- a/arch/riscv/mm/pmem.c +++ b/arch/riscv/mm/pmem.c @@ -7,15 +7,28 @@ #include #include +#include void arch_wb_cache_pmem(void *addr, size_t size) { +#ifdef CONFIG_RISCV_NONSTANDARD_CACHE_OPS + if (unlikely(noncoherent_cache_ops.wback)) { + noncoherent_cache_ops.wback(virt_to_phys(addr), size); + return; + } +#endif ALT_CMO_OP(clean, addr, size, riscv_cbom_block_size); } EXPORT_SYMBOL_GPL(arch_wb_cache_pmem); void arch_invalidate_pmem(void *addr, size_t size) { +#ifdef CONFIG_RISCV_NONSTANDARD_CACHE_OPS + if (unlikely(noncoherent_cache_ops.inv)) { + noncoherent_cache_ops.inv(virt_to_phys(addr), size); + return; + } +#endif ALT_CMO_OP(inval, addr, size, riscv_cbom_block_size); } EXPORT_SYMBOL_GPL(arch_invalidate_pmem); -- cgit v1.2.3 From 9300f00439743c4a34d735e1a27118eb68a1504e Mon Sep 17 00:00:00 2001 From: Andy Chiu Date: Fri, 25 Aug 2023 05:02:46 +0000 Subject: RISC-V: Add ptrace support for vectors This patch add back the ptrace support with the following fix: - Define NT_RISCV_CSR and re-number NT_RISCV_VECTOR to prevent conflicting with gdb's NT_RISCV_CSR. - Use struct __riscv_v_regset_state to handle ptrace requests Since gdb does not directly include the note description header in Linux and has already defined NT_RISCV_CSR as 0x900, we decide to sync with gdb and renumber NT_RISCV_VECTOR to solve and prevent future conflicts. Fixes: 0c59922c769a ("riscv: Add ptrace vector support") Signed-off-by: Andy Chiu Link: https://lore.kernel.org/r/20230825050248.32681-1-andy.chiu@sifive.com [Palmer: Drop the unused "size" variable in riscv_vr_set().] Signed-off-by: Palmer Dabbelt --- arch/riscv/include/uapi/asm/ptrace.h | 13 ++++-- arch/riscv/kernel/ptrace.c | 79 ++++++++++++++++++++++++++++++++++++ include/uapi/linux/elf.h | 2 + 3 files changed, 90 insertions(+), 4 deletions(-) (limited to 'arch/riscv') diff --git a/arch/riscv/include/uapi/asm/ptrace.h b/arch/riscv/include/uapi/asm/ptrace.h index 283800130614..575e95bb1bc3 100644 --- a/arch/riscv/include/uapi/asm/ptrace.h +++ b/arch/riscv/include/uapi/asm/ptrace.h @@ -103,13 +103,18 @@ struct __riscv_v_ext_state { * In signal handler, datap will be set a correct user stack offset * and vector registers will be copied to the address of datap * pointer. - * - * In ptrace syscall, datap will be set to zero and the vector - * registers will be copied to the address right after this - * structure. */ }; +struct __riscv_v_regset_state { + unsigned long vstart; + unsigned long vl; + unsigned long vtype; + unsigned long vcsr; + unsigned long vlenb; + char vreg[]; +}; + /* * According to spec: The number of bits in a single vector register, * VLEN >= ELEN, which must be a power of 2, and must be no greater than diff --git a/arch/riscv/kernel/ptrace.c b/arch/riscv/kernel/ptrace.c index 487303e3ef22..2afe460de16a 100644 --- a/arch/riscv/kernel/ptrace.c +++ b/arch/riscv/kernel/ptrace.c @@ -25,6 +25,9 @@ enum riscv_regset { #ifdef CONFIG_FPU REGSET_F, #endif +#ifdef CONFIG_RISCV_ISA_V + REGSET_V, +#endif }; static int riscv_gpr_get(struct task_struct *target, @@ -81,6 +84,71 @@ static int riscv_fpr_set(struct task_struct *target, } #endif +#ifdef CONFIG_RISCV_ISA_V +static int riscv_vr_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + struct __riscv_v_ext_state *vstate = &target->thread.vstate; + struct __riscv_v_regset_state ptrace_vstate; + + if (!riscv_v_vstate_query(task_pt_regs(target))) + return -EINVAL; + + /* + * Ensure the vector registers have been saved to the memory before + * copying them to membuf. + */ + if (target == current) + riscv_v_vstate_save(current, task_pt_regs(current)); + + ptrace_vstate.vstart = vstate->vstart; + ptrace_vstate.vl = vstate->vl; + ptrace_vstate.vtype = vstate->vtype; + ptrace_vstate.vcsr = vstate->vcsr; + ptrace_vstate.vlenb = vstate->vlenb; + + /* Copy vector header from vstate. */ + membuf_write(&to, &ptrace_vstate, sizeof(struct __riscv_v_regset_state)); + + /* Copy all the vector registers from vstate. */ + return membuf_write(&to, vstate->datap, riscv_v_vsize); +} + +static int riscv_vr_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int ret; + struct __riscv_v_ext_state *vstate = &target->thread.vstate; + struct __riscv_v_regset_state ptrace_vstate; + + if (!riscv_v_vstate_query(task_pt_regs(target))) + return -EINVAL; + + /* Copy rest of the vstate except datap */ + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &ptrace_vstate, 0, + sizeof(struct __riscv_v_regset_state)); + if (unlikely(ret)) + return ret; + + if (vstate->vlenb != ptrace_vstate.vlenb) + return -EINVAL; + + vstate->vstart = ptrace_vstate.vstart; + vstate->vl = ptrace_vstate.vl; + vstate->vtype = ptrace_vstate.vtype; + vstate->vcsr = ptrace_vstate.vcsr; + + /* Copy all the vector registers. */ + pos = 0; + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, vstate->datap, + 0, riscv_v_vsize); + return ret; +} +#endif + static const struct user_regset riscv_user_regset[] = { [REGSET_X] = { .core_note_type = NT_PRSTATUS, @@ -100,6 +168,17 @@ static const struct user_regset riscv_user_regset[] = { .set = riscv_fpr_set, }, #endif +#ifdef CONFIG_RISCV_ISA_V + [REGSET_V] = { + .core_note_type = NT_RISCV_VECTOR, + .align = 16, + .n = ((32 * RISCV_MAX_VLENB) + + sizeof(struct __riscv_v_regset_state)) / sizeof(__u32), + .size = sizeof(__u32), + .regset_get = riscv_vr_get, + .set = riscv_vr_set, + }, +#endif }; static const struct user_regset_view riscv_user_native_view = { diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h index e0e159138331..20e285fdbc46 100644 --- a/include/uapi/linux/elf.h +++ b/include/uapi/linux/elf.h @@ -443,6 +443,8 @@ typedef struct elf64_shdr { #define NT_MIPS_DSP 0x800 /* MIPS DSP ASE registers */ #define NT_MIPS_FP_MODE 0x801 /* MIPS floating-point mode */ #define NT_MIPS_MSA 0x802 /* MIPS SIMD registers */ +#define NT_RISCV_CSR 0x900 /* RISC-V Control and Status Registers */ +#define NT_RISCV_VECTOR 0x901 /* RISC-V vector registers */ #define NT_LOONGARCH_CPUCFG 0xa00 /* LoongArch CPU config registers */ #define NT_LOONGARCH_CSR 0xa01 /* LoongArch control and status registers */ #define NT_LOONGARCH_LSX 0xa02 /* LoongArch Loongson SIMD Extension registers */ -- cgit v1.2.3 From 84fe419dc7578b03e721b9bd6eb07947db70fd0e Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Sat, 22 Jul 2023 14:38:46 +0200 Subject: riscv: Introduce virtual kernel mapping KASLR KASLR implementation relies on a relocatable kernel so that we can move the kernel mapping. The seed needed to virtually move the kernel is taken from the device tree, so we rely on the bootloader to provide a correct seed. Zkr could be used unconditionnally instead if implemented, but that's for another patch. Signed-off-by: Alexandre Ghiti Tested-by: Conor Dooley Tested-by: Song Shuai Reviewed-by: Sami Tolvanen Tested-by: Sami Tolvanen Link: https://lore.kernel.org/r/20230722123850.634544-2-alexghiti@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 19 +++++++++++++++++++ arch/riscv/include/asm/page.h | 3 +++ arch/riscv/kernel/pi/Makefile | 2 +- arch/riscv/kernel/pi/cmdline_early.c | 13 +++++++++++++ arch/riscv/kernel/pi/fdt_early.c | 30 ++++++++++++++++++++++++++++++ arch/riscv/mm/init.c | 36 +++++++++++++++++++++++++++++++++++- 6 files changed, 101 insertions(+), 2 deletions(-) create mode 100644 arch/riscv/kernel/pi/fdt_early.c (limited to 'arch/riscv') diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 4c07b9189c86..6a606d5b74c6 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -719,6 +719,25 @@ config RELOCATABLE If unsure, say N. +config RANDOMIZE_BASE + bool "Randomize the address of the kernel image" + select RELOCATABLE + depends on MMU && 64BIT && !XIP_KERNEL + help + Randomizes the virtual address at which the kernel image is + loaded, as a security feature that deters exploit attempts + relying on knowledge of the location of kernel internals. + + It is the bootloader's job to provide entropy, by passing a + random u64 value in /chosen/kaslr-seed at kernel entry. + + When booting via the UEFI stub, it will invoke the firmware's + EFI_RNG_PROTOCOL implementation (if available) to supply entropy + to the kernel proper. In addition, it will randomise the physical + location of the kernel Image as well. + + If unsure, say N. + endmenu # "Kernel features" menu "Boot options" diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h index b55ba20903ec..5488ecc337b6 100644 --- a/arch/riscv/include/asm/page.h +++ b/arch/riscv/include/asm/page.h @@ -106,6 +106,7 @@ typedef struct page *pgtable_t; struct kernel_mapping { unsigned long page_offset; unsigned long virt_addr; + unsigned long virt_offset; uintptr_t phys_addr; uintptr_t size; /* Offset between linear mapping virtual address and kernel load address */ @@ -185,6 +186,8 @@ extern phys_addr_t __phys_addr_symbol(unsigned long x); #define sym_to_pfn(x) __phys_to_pfn(__pa_symbol(x)) +unsigned long kaslr_offset(void); + #endif /* __ASSEMBLY__ */ #define virt_addr_valid(vaddr) ({ \ diff --git a/arch/riscv/kernel/pi/Makefile b/arch/riscv/kernel/pi/Makefile index 7b593d44c712..07915dc9279e 100644 --- a/arch/riscv/kernel/pi/Makefile +++ b/arch/riscv/kernel/pi/Makefile @@ -35,5 +35,5 @@ $(obj)/string.o: $(srctree)/lib/string.c FORCE $(obj)/ctype.o: $(srctree)/lib/ctype.c FORCE $(call if_changed_rule,cc_o_c) -obj-y := cmdline_early.pi.o string.pi.o ctype.pi.o lib-fdt.pi.o lib-fdt_ro.pi.o +obj-y := cmdline_early.pi.o fdt_early.pi.o string.pi.o ctype.pi.o lib-fdt.pi.o lib-fdt_ro.pi.o extra-y := $(patsubst %.pi.o,%.o,$(obj-y)) diff --git a/arch/riscv/kernel/pi/cmdline_early.c b/arch/riscv/kernel/pi/cmdline_early.c index 05652d13c746..68e786c84c94 100644 --- a/arch/riscv/kernel/pi/cmdline_early.c +++ b/arch/riscv/kernel/pi/cmdline_early.c @@ -14,6 +14,7 @@ static char early_cmdline[COMMAND_LINE_SIZE]; * LLVM complain because the function is actually unused in this file). */ u64 set_satp_mode_from_cmdline(uintptr_t dtb_pa); +bool set_nokaslr_from_cmdline(uintptr_t dtb_pa); static char *get_early_cmdline(uintptr_t dtb_pa) { @@ -60,3 +61,15 @@ u64 set_satp_mode_from_cmdline(uintptr_t dtb_pa) return match_noXlvl(cmdline); } + +static bool match_nokaslr(char *cmdline) +{ + return strstr(cmdline, "nokaslr"); +} + +bool set_nokaslr_from_cmdline(uintptr_t dtb_pa) +{ + char *cmdline = get_early_cmdline(dtb_pa); + + return match_nokaslr(cmdline); +} diff --git a/arch/riscv/kernel/pi/fdt_early.c b/arch/riscv/kernel/pi/fdt_early.c new file mode 100644 index 000000000000..899610e042ab --- /dev/null +++ b/arch/riscv/kernel/pi/fdt_early.c @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include + +/* + * Declare the functions that are exported (but prefixed) here so that LLVM + * does not complain it lacks the 'static' keyword (which, if added, makes + * LLVM complain because the function is actually unused in this file). + */ +u64 get_kaslr_seed(uintptr_t dtb_pa); + +u64 get_kaslr_seed(uintptr_t dtb_pa) +{ + int node, len; + fdt64_t *prop; + u64 ret; + + node = fdt_path_offset((void *)dtb_pa, "/chosen"); + if (node < 0) + return 0; + + prop = fdt_getprop_w((void *)dtb_pa, node, "kaslr-seed", &len); + if (!prop || len != sizeof(u64)) + return 0; + + ret = fdt64_to_cpu(*prop); + *prop = 0; + return ret; +} diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 70fb31960b63..ff926531236e 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -1012,11 +1012,45 @@ static void __init pt_ops_set_late(void) #endif } +#ifdef CONFIG_RANDOMIZE_BASE +extern bool __init __pi_set_nokaslr_from_cmdline(uintptr_t dtb_pa); +extern u64 __init __pi_get_kaslr_seed(uintptr_t dtb_pa); + +static int __init print_nokaslr(char *p) +{ + pr_info("Disabled KASLR"); + return 0; +} +early_param("nokaslr", print_nokaslr); + +unsigned long kaslr_offset(void) +{ + return kernel_map.virt_offset; +} +#endif + asmlinkage void __init setup_vm(uintptr_t dtb_pa) { pmd_t __maybe_unused fix_bmap_spmd, fix_bmap_epmd; - kernel_map.virt_addr = KERNEL_LINK_ADDR; +#ifdef CONFIG_RANDOMIZE_BASE + if (!__pi_set_nokaslr_from_cmdline(dtb_pa)) { + u64 kaslr_seed = __pi_get_kaslr_seed(dtb_pa); + u32 kernel_size = (uintptr_t)(&_end) - (uintptr_t)(&_start); + u32 nr_pos; + + /* + * Compute the number of positions available: we are limited + * by the early page table that only has one PUD and we must + * be aligned on PMD_SIZE. + */ + nr_pos = (PUD_SIZE - kernel_size) / PMD_SIZE; + + kernel_map.virt_offset = (kaslr_seed % nr_pos) * PMD_SIZE; + } +#endif + + kernel_map.virt_addr = KERNEL_LINK_ADDR + kernel_map.virt_offset; kernel_map.page_offset = _AC(CONFIG_PAGE_OFFSET, UL); #ifdef CONFIG_XIP_KERNEL -- cgit v1.2.3 From 54a519e6aff9a6bc8922149e94f89c4bf4e3e8fb Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Sat, 22 Jul 2023 14:38:47 +0200 Subject: riscv: Dump out kernel offset information on panic Dump out the KASLR virtual kernel offset when panic to help debug kernel. Signed-off-by: Zong Li Signed-off-by: Alexandre Ghiti Tested-by: Conor Dooley Tested-by: Song Shuai Reviewed-by: Sami Tolvanen Tested-by: Sami Tolvanen Link: https://lore.kernel.org/r/20230722123850.634544-3-alexghiti@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/setup.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'arch/riscv') diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c index 971fe776e2f8..0fb5a26ca4cc 100644 --- a/arch/riscv/kernel/setup.c +++ b/arch/riscv/kernel/setup.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -341,3 +342,27 @@ void free_initmem(void) free_initmem_default(POISON_FREE_INITMEM); } + +static int dump_kernel_offset(struct notifier_block *self, + unsigned long v, void *p) +{ + pr_emerg("Kernel Offset: 0x%lx from 0x%lx\n", + kernel_map.virt_offset, + KERNEL_LINK_ADDR); + + return 0; +} + +static struct notifier_block kernel_offset_notifier = { + .notifier_call = dump_kernel_offset +}; + +static int __init register_kernel_offset_dumper(void) +{ + if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) + atomic_notifier_chain_register(&panic_notifier_list, + &kernel_offset_notifier); + + return 0; +} +device_initcall(register_kernel_offset_dumper); -- cgit v1.2.3 From b7ac4b8ee73d4fec0998664c9dd61f089d481044 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Sat, 22 Jul 2023 14:38:50 +0200 Subject: riscv: libstub: Implement KASLR by using generic functions We can now use arm64 functions to handle the move of the kernel physical mapping: if KASLR is enabled, we will try to get a random seed from the firmware, if not possible, the kernel will be moved to a location that suits its alignment constraints. Signed-off-by: Alexandre Ghiti Tested-by: Conor Dooley Tested-by: Song Shuai Reviewed-by: Sami Tolvanen Tested-by: Sami Tolvanen Link: https://lore.kernel.org/r/20230722123850.634544-6-alexghiti@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/efi.h | 2 ++ arch/riscv/kernel/image-vars.h | 1 + drivers/firmware/efi/libstub/Makefile | 2 +- drivers/firmware/efi/libstub/riscv-stub.c | 33 ++++++++++++++----------------- 4 files changed, 19 insertions(+), 19 deletions(-) (limited to 'arch/riscv') diff --git a/arch/riscv/include/asm/efi.h b/arch/riscv/include/asm/efi.h index 29e9a0d84b16..00b24ba55035 100644 --- a/arch/riscv/include/asm/efi.h +++ b/arch/riscv/include/asm/efi.h @@ -51,4 +51,6 @@ void efi_virtmap_unload(void); unsigned long stext_offset(void); +void efi_icache_sync(unsigned long start, unsigned long end); + #endif /* _ASM_EFI_H */ diff --git a/arch/riscv/kernel/image-vars.h b/arch/riscv/kernel/image-vars.h index 15616155008c..ea1a10355ce9 100644 --- a/arch/riscv/kernel/image-vars.h +++ b/arch/riscv/kernel/image-vars.h @@ -27,6 +27,7 @@ __efistub__start = _start; __efistub__start_kernel = _start_kernel; __efistub__end = _end; __efistub__edata = _edata; +__efistub___init_text_end = __init_text_end; __efistub_screen_info = screen_info; #endif diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile index 11aba8a041ec..dc90a31b189f 100644 --- a/drivers/firmware/efi/libstub/Makefile +++ b/drivers/firmware/efi/libstub/Makefile @@ -88,7 +88,7 @@ lib-$(CONFIG_EFI_GENERIC_STUB) += efi-stub.o string.o intrinsics.o systable.o \ lib-$(CONFIG_ARM) += arm32-stub.o lib-$(CONFIG_ARM64) += kaslr.o arm64.o arm64-stub.o smbios.o lib-$(CONFIG_X86) += x86-stub.o -lib-$(CONFIG_RISCV) += riscv.o riscv-stub.o +lib-$(CONFIG_RISCV) += kaslr.o riscv.o riscv-stub.o lib-$(CONFIG_LOONGARCH) += loongarch.o loongarch-stub.o CFLAGS_arm32-stub.o := -DTEXT_OFFSET=$(TEXT_OFFSET) diff --git a/drivers/firmware/efi/libstub/riscv-stub.c b/drivers/firmware/efi/libstub/riscv-stub.c index 145c9f0ba217..c96d6dcee86c 100644 --- a/drivers/firmware/efi/libstub/riscv-stub.c +++ b/drivers/firmware/efi/libstub/riscv-stub.c @@ -30,32 +30,29 @@ efi_status_t handle_kernel_image(unsigned long *image_addr, efi_loaded_image_t *image, efi_handle_t image_handle) { - unsigned long kernel_size = 0; - unsigned long preferred_addr; + unsigned long kernel_size, kernel_codesize, kernel_memsize; efi_status_t status; kernel_size = _edata - _start; + kernel_codesize = __init_text_end - _start; + kernel_memsize = kernel_size + (_end - _edata); *image_addr = (unsigned long)_start; - *image_size = kernel_size + (_end - _edata); - - /* - * RISC-V kernel maps PAGE_OFFSET virtual address to the same physical - * address where kernel is booted. That's why kernel should boot from - * as low as possible to avoid wastage of memory. Currently, dram_base - * is occupied by the firmware. So the preferred address for kernel to - * boot is next aligned address. If preferred address is not available, - * relocate_kernel will fall back to efi_low_alloc_above to allocate - * lowest possible memory region as long as the address and size meets - * the alignment constraints. - */ - preferred_addr = EFI_KIMG_PREFERRED_ADDRESS; - status = efi_relocate_kernel(image_addr, kernel_size, *image_size, - preferred_addr, efi_get_kimg_min_align(), - 0x0); + *image_size = kernel_memsize; + *reserve_size = *image_size; + status = efi_kaslr_relocate_kernel(image_addr, + reserve_addr, reserve_size, + kernel_size, kernel_codesize, kernel_memsize, + efi_kaslr_get_phys_seed(image_handle)); if (status != EFI_SUCCESS) { efi_err("Failed to relocate kernel\n"); *image_size = 0; } + return status; } + +void efi_icache_sync(unsigned long start, unsigned long end) +{ + asm volatile ("fence.i" ::: "memory"); +} -- cgit v1.2.3 From 9721873c3c023254f04138bbb21d539b4f0f0ad6 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Thu, 31 Aug 2023 13:12:27 +0000 Subject: riscv: extend patch_text_nosync() for multiple pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The patch_insn_write() function currently doesn't work for multiple pages of instructions, therefore patch_text_nosync() will fail with a page fault if called with lengths spanning multiple pages. This commit extends the patch_insn_write() function to support multiple pages by copying at max 2 pages at a time in a loop. This implementation is similar to text_poke_copy() function of x86. Signed-off-by: Puranjay Mohan Reviewed-by: Pu Lehui Reviewed-by: Björn Töpel Tested-by: Björn Töpel Acked-by: Daniel Borkmann Link: https://lore.kernel.org/r/20230831131229.497941-3-puranjay12@gmail.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/patch.c | 37 ++++++++++++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 5 deletions(-) (limited to 'arch/riscv') diff --git a/arch/riscv/kernel/patch.c b/arch/riscv/kernel/patch.c index 575e71d6c8ae..2c97e246f4dc 100644 --- a/arch/riscv/kernel/patch.c +++ b/arch/riscv/kernel/patch.c @@ -53,12 +53,18 @@ static void patch_unmap(int fixmap) } NOKPROBE_SYMBOL(patch_unmap); -static int patch_insn_write(void *addr, const void *insn, size_t len) +static int __patch_insn_write(void *addr, const void *insn, size_t len) { void *waddr = addr; bool across_pages = (((uintptr_t) addr & ~PAGE_MASK) + len) > PAGE_SIZE; int ret; + /* + * Only two pages can be mapped at a time for writing. + */ + if (len + offset_in_page(addr) > 2 * PAGE_SIZE) + return -EINVAL; + /* * Before reaching here, it was expected to lock the text_mutex * already, so we don't need to give another lock here and could @@ -74,7 +80,7 @@ static int patch_insn_write(void *addr, const void *insn, size_t len) lockdep_assert_held(&text_mutex); if (across_pages) - patch_map(addr + len, FIX_TEXT_POKE1); + patch_map(addr + PAGE_SIZE, FIX_TEXT_POKE1); waddr = patch_map(addr, FIX_TEXT_POKE0); @@ -87,15 +93,36 @@ static int patch_insn_write(void *addr, const void *insn, size_t len) return ret; } -NOKPROBE_SYMBOL(patch_insn_write); +NOKPROBE_SYMBOL(__patch_insn_write); #else -static int patch_insn_write(void *addr, const void *insn, size_t len) +static int __patch_insn_write(void *addr, const void *insn, size_t len) { return copy_to_kernel_nofault(addr, insn, len); } -NOKPROBE_SYMBOL(patch_insn_write); +NOKPROBE_SYMBOL(__patch_insn_write); #endif /* CONFIG_MMU */ +static int patch_insn_write(void *addr, const void *insn, size_t len) +{ + size_t patched = 0; + size_t size; + int ret = 0; + + /* + * Copy the instructions to the destination address, two pages at a time + * because __patch_insn_write() can only handle len <= 2 * PAGE_SIZE. + */ + while (patched < len && !ret) { + size = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(addr + patched), len - patched); + ret = __patch_insn_write(addr + patched, insn + patched, size); + + patched += size; + } + + return ret; +} +NOKPROBE_SYMBOL(patch_insn_write); + int patch_text_nosync(void *addr, const void *insns, size_t len) { u32 *tp = addr; -- cgit v1.2.3 From cad539baa48ff257b598000a90db2b7edd4b2dd5 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Thu, 31 Aug 2023 13:12:28 +0000 Subject: riscv: implement a memset like function for text MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The BPF JIT needs to write invalid instructions to RX regions of memory to invalidate removed BPF programs. This needs a function like memset() that can work with RX memory. Implement patch_text_set_nosync() which is similar to text_poke_set() of x86. Signed-off-by: Puranjay Mohan Reviewed-by: Pu Lehui Acked-by: Björn Töpel Tested-by: Björn Töpel Acked-by: Daniel Borkmann Link: https://lore.kernel.org/r/20230831131229.497941-4-puranjay12@gmail.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/patch.h | 1 + arch/riscv/kernel/patch.c | 77 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 78 insertions(+) (limited to 'arch/riscv') diff --git a/arch/riscv/include/asm/patch.h b/arch/riscv/include/asm/patch.h index 63c98833d510..e88b52d39eac 100644 --- a/arch/riscv/include/asm/patch.h +++ b/arch/riscv/include/asm/patch.h @@ -7,6 +7,7 @@ #define _ASM_RISCV_PATCH_H int patch_text_nosync(void *addr, const void *insns, size_t len); +int patch_text_set_nosync(void *addr, u8 c, size_t len); int patch_text(void *addr, u32 *insns, int ninsns); extern int riscv_patch_in_stop_machine; diff --git a/arch/riscv/kernel/patch.c b/arch/riscv/kernel/patch.c index 2c97e246f4dc..13ee7bf589a1 100644 --- a/arch/riscv/kernel/patch.c +++ b/arch/riscv/kernel/patch.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -53,6 +54,39 @@ static void patch_unmap(int fixmap) } NOKPROBE_SYMBOL(patch_unmap); +static int __patch_insn_set(void *addr, u8 c, size_t len) +{ + void *waddr = addr; + bool across_pages = (((uintptr_t)addr & ~PAGE_MASK) + len) > PAGE_SIZE; + + /* + * Only two pages can be mapped at a time for writing. + */ + if (len + offset_in_page(addr) > 2 * PAGE_SIZE) + return -EINVAL; + /* + * Before reaching here, it was expected to lock the text_mutex + * already, so we don't need to give another lock here and could + * ensure that it was safe between each cores. + */ + lockdep_assert_held(&text_mutex); + + if (across_pages) + patch_map(addr + PAGE_SIZE, FIX_TEXT_POKE1); + + waddr = patch_map(addr, FIX_TEXT_POKE0); + + memset(waddr, c, len); + + patch_unmap(FIX_TEXT_POKE0); + + if (across_pages) + patch_unmap(FIX_TEXT_POKE1); + + return 0; +} +NOKPROBE_SYMBOL(__patch_insn_set); + static int __patch_insn_write(void *addr, const void *insn, size_t len) { void *waddr = addr; @@ -95,6 +129,14 @@ static int __patch_insn_write(void *addr, const void *insn, size_t len) } NOKPROBE_SYMBOL(__patch_insn_write); #else +static int __patch_insn_set(void *addr, u8 c, size_t len) +{ + memset(addr, c, len); + + return 0; +} +NOKPROBE_SYMBOL(__patch_insn_set); + static int __patch_insn_write(void *addr, const void *insn, size_t len) { return copy_to_kernel_nofault(addr, insn, len); @@ -102,6 +144,41 @@ static int __patch_insn_write(void *addr, const void *insn, size_t len) NOKPROBE_SYMBOL(__patch_insn_write); #endif /* CONFIG_MMU */ +static int patch_insn_set(void *addr, u8 c, size_t len) +{ + size_t patched = 0; + size_t size; + int ret = 0; + + /* + * __patch_insn_set() can only work on 2 pages at a time so call it in a + * loop with len <= 2 * PAGE_SIZE. + */ + while (patched < len && !ret) { + size = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(addr + patched), len - patched); + ret = __patch_insn_set(addr + patched, c, size); + + patched += size; + } + + return ret; +} +NOKPROBE_SYMBOL(patch_insn_set); + +int patch_text_set_nosync(void *addr, u8 c, size_t len) +{ + u32 *tp = addr; + int ret; + + ret = patch_insn_set(tp, c, len); + + if (!ret) + flush_icache_range((uintptr_t)tp, (uintptr_t)tp + len); + + return ret; +} +NOKPROBE_SYMBOL(patch_text_set_nosync); + static int patch_insn_write(void *addr, const void *insn, size_t len) { size_t patched = 0; -- cgit v1.2.3 From 48a8f78c50bd6f7f08fd40daa62252fd043f2f18 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Thu, 31 Aug 2023 13:12:29 +0000 Subject: bpf, riscv: use prog pack allocator in the BPF JIT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use bpf_jit_binary_pack_alloc() for memory management of JIT binaries in RISCV BPF JIT. The bpf_jit_binary_pack_alloc creates a pair of RW and RX buffers. The JIT writes the program into the RW buffer. When the JIT is done, the program is copied to the final RX buffer with bpf_jit_binary_pack_finalize. Implement bpf_arch_text_copy() and bpf_arch_text_invalidate() for RISCV JIT as these functions are required by bpf_jit_binary_pack allocator. Signed-off-by: Puranjay Mohan Reviewed-by: Song Liu Reviewed-by: Pu Lehui Acked-by: Björn Töpel Tested-by: Björn Töpel Acked-by: Daniel Borkmann Link: https://lore.kernel.org/r/20230831131229.497941-5-puranjay12@gmail.com Signed-off-by: Palmer Dabbelt --- arch/riscv/net/bpf_jit.h | 3 ++ arch/riscv/net/bpf_jit_comp64.c | 60 ++++++++++++++++++----- arch/riscv/net/bpf_jit_core.c | 106 ++++++++++++++++++++++++++++++++++------ 3 files changed, 141 insertions(+), 28 deletions(-) (limited to 'arch/riscv') diff --git a/arch/riscv/net/bpf_jit.h b/arch/riscv/net/bpf_jit.h index d21c6c92a683..a5ce1ab76ece 100644 --- a/arch/riscv/net/bpf_jit.h +++ b/arch/riscv/net/bpf_jit.h @@ -68,6 +68,7 @@ static inline bool is_creg(u8 reg) struct rv_jit_context { struct bpf_prog *prog; u16 *insns; /* RV insns */ + u16 *ro_insns; int ninsns; int prologue_len; int epilogue_offset; @@ -85,7 +86,9 @@ static inline int ninsns_rvoff(int ninsns) struct rv_jit_data { struct bpf_binary_header *header; + struct bpf_binary_header *ro_header; u8 *image; + u8 *ro_image; struct rv_jit_context ctx; }; diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c index 8423f4ddf8f5..ecd3ae6f4116 100644 --- a/arch/riscv/net/bpf_jit_comp64.c +++ b/arch/riscv/net/bpf_jit_comp64.c @@ -144,7 +144,11 @@ static bool in_auipc_jalr_range(s64 val) /* Emit fixed-length instructions for address */ static int emit_addr(u8 rd, u64 addr, bool extra_pass, struct rv_jit_context *ctx) { - u64 ip = (u64)(ctx->insns + ctx->ninsns); + /* + * Use the ro_insns(RX) to calculate the offset as the BPF program will + * finally run from this memory region. + */ + u64 ip = (u64)(ctx->ro_insns + ctx->ninsns); s64 off = addr - ip; s64 upper = (off + (1 << 11)) >> 12; s64 lower = off & 0xfff; @@ -464,8 +468,12 @@ static int emit_call(u64 addr, bool fixed_addr, struct rv_jit_context *ctx) s64 off = 0; u64 ip; - if (addr && ctx->insns) { - ip = (u64)(long)(ctx->insns + ctx->ninsns); + if (addr && ctx->insns && ctx->ro_insns) { + /* + * Use the ro_insns(RX) to calculate the offset as the BPF + * program will finally run from this memory region. + */ + ip = (u64)(long)(ctx->ro_insns + ctx->ninsns); off = addr - ip; } @@ -578,9 +586,10 @@ static int add_exception_handler(const struct bpf_insn *insn, { struct exception_table_entry *ex; unsigned long pc; - off_t offset; + off_t ins_offset; + off_t fixup_offset; - if (!ctx->insns || !ctx->prog->aux->extable || + if (!ctx->insns || !ctx->ro_insns || !ctx->prog->aux->extable || (BPF_MODE(insn->code) != BPF_PROBE_MEM && BPF_MODE(insn->code) != BPF_PROBE_MEMSX)) return 0; @@ -594,12 +603,17 @@ static int add_exception_handler(const struct bpf_insn *insn, return -EINVAL; ex = &ctx->prog->aux->extable[ctx->nexentries]; - pc = (unsigned long)&ctx->insns[ctx->ninsns - insn_len]; + pc = (unsigned long)&ctx->ro_insns[ctx->ninsns - insn_len]; - offset = pc - (long)&ex->insn; - if (WARN_ON_ONCE(offset >= 0 || offset < INT_MIN)) + /* + * This is the relative offset of the instruction that may fault from + * the exception table itself. This will be written to the exception + * table and if this instruction faults, the destination register will + * be set to '0' and the execution will jump to the next instruction. + */ + ins_offset = pc - (long)&ex->insn; + if (WARN_ON_ONCE(ins_offset >= 0 || ins_offset < INT_MIN)) return -ERANGE; - ex->insn = offset; /* * Since the extable follows the program, the fixup offset is always @@ -608,12 +622,25 @@ static int add_exception_handler(const struct bpf_insn *insn, * bits. We don't need to worry about buildtime or runtime sort * modifying the upper bits because the table is already sorted, and * isn't part of the main exception table. + * + * The fixup_offset is set to the next instruction from the instruction + * that may fault. The execution will jump to this after handling the + * fault. */ - offset = (long)&ex->fixup - (pc + insn_len * sizeof(u16)); - if (!FIELD_FIT(BPF_FIXUP_OFFSET_MASK, offset)) + fixup_offset = (long)&ex->fixup - (pc + insn_len * sizeof(u16)); + if (!FIELD_FIT(BPF_FIXUP_OFFSET_MASK, fixup_offset)) return -ERANGE; - ex->fixup = FIELD_PREP(BPF_FIXUP_OFFSET_MASK, offset) | + /* + * The offsets above have been calculated using the RO buffer but we + * need to use the R/W buffer for writes. + * switch ex to rw buffer for writing. + */ + ex = (void *)ctx->insns + ((void *)ex - (void *)ctx->ro_insns); + + ex->insn = ins_offset; + + ex->fixup = FIELD_PREP(BPF_FIXUP_OFFSET_MASK, fixup_offset) | FIELD_PREP(BPF_FIXUP_REG_MASK, dst_reg); ex->type = EX_TYPE_BPF; @@ -1007,6 +1034,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, ctx.ninsns = 0; ctx.insns = NULL; + ctx.ro_insns = NULL; ret = __arch_prepare_bpf_trampoline(im, m, tlinks, func_addr, flags, &ctx); if (ret < 0) return ret; @@ -1015,7 +1043,15 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, return -EFBIG; ctx.ninsns = 0; + /* + * The bpf_int_jit_compile() uses a RW buffer (ctx.insns) to write the + * JITed instructions and later copies it to a RX region (ctx.ro_insns). + * It also uses ctx.ro_insns to calculate offsets for jumps etc. As the + * trampoline image uses the same memory area for writing and execution, + * both ctx.insns and ctx.ro_insns can be set to image. + */ ctx.insns = image; + ctx.ro_insns = image; ret = __arch_prepare_bpf_trampoline(im, m, tlinks, func_addr, flags, &ctx); if (ret < 0) return ret; diff --git a/arch/riscv/net/bpf_jit_core.c b/arch/riscv/net/bpf_jit_core.c index 7a26a3e1c73c..7b70ccb7fec3 100644 --- a/arch/riscv/net/bpf_jit_core.c +++ b/arch/riscv/net/bpf_jit_core.c @@ -8,6 +8,8 @@ #include #include +#include +#include #include "bpf_jit.h" /* Number of iterations to try until offsets converge. */ @@ -117,16 +119,24 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) sizeof(struct exception_table_entry); prog_size = sizeof(*ctx->insns) * ctx->ninsns; - jit_data->header = - bpf_jit_binary_alloc(prog_size + extable_size, - &jit_data->image, - sizeof(u32), - bpf_fill_ill_insns); - if (!jit_data->header) { + jit_data->ro_header = + bpf_jit_binary_pack_alloc(prog_size + extable_size, + &jit_data->ro_image, sizeof(u32), + &jit_data->header, &jit_data->image, + bpf_fill_ill_insns); + if (!jit_data->ro_header) { prog = orig_prog; goto out_offset; } + /* + * Use the image(RW) for writing the JITed instructions. But also save + * the ro_image(RX) for calculating the offsets in the image. The RW + * image will be later copied to the RX image from where the program + * will run. The bpf_jit_binary_pack_finalize() will do this copy in the + * final step. + */ + ctx->ro_insns = (u16 *)jit_data->ro_image; ctx->insns = (u16 *)jit_data->image; /* * Now, when the image is allocated, the image can @@ -138,14 +148,12 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) if (i == NR_JIT_ITERATIONS) { pr_err("bpf-jit: image did not converge in <%d passes!\n", i); - if (jit_data->header) - bpf_jit_binary_free(jit_data->header); prog = orig_prog; - goto out_offset; + goto out_free_hdr; } if (extable_size) - prog->aux->extable = (void *)ctx->insns + prog_size; + prog->aux->extable = (void *)ctx->ro_insns + prog_size; skip_init_ctx: pass++; @@ -154,23 +162,33 @@ skip_init_ctx: bpf_jit_build_prologue(ctx); if (build_body(ctx, extra_pass, NULL)) { - bpf_jit_binary_free(jit_data->header); prog = orig_prog; - goto out_offset; + goto out_free_hdr; } bpf_jit_build_epilogue(ctx); if (bpf_jit_enable > 1) bpf_jit_dump(prog->len, prog_size, pass, ctx->insns); - prog->bpf_func = (void *)ctx->insns; + prog->bpf_func = (void *)ctx->ro_insns; prog->jited = 1; prog->jited_len = prog_size; - bpf_flush_icache(jit_data->header, ctx->insns + ctx->ninsns); - if (!prog->is_func || extra_pass) { - bpf_jit_binary_lock_ro(jit_data->header); + if (WARN_ON(bpf_jit_binary_pack_finalize(prog, jit_data->ro_header, + jit_data->header))) { + /* ro_header has been freed */ + jit_data->ro_header = NULL; + prog = orig_prog; + goto out_offset; + } + /* + * The instructions have now been copied to the ROX region from + * where they will execute. + * Write any modified data cache blocks out to memory and + * invalidate the corresponding blocks in the instruction cache. + */ + bpf_flush_icache(jit_data->ro_header, ctx->ro_insns + ctx->ninsns); for (i = 0; i < prog->len; i++) ctx->offset[i] = ninsns_rvoff(ctx->offset[i]); bpf_prog_fill_jited_linfo(prog, ctx->offset); @@ -185,6 +203,14 @@ out: bpf_jit_prog_release_other(prog, prog == orig_prog ? tmp : orig_prog); return prog; + +out_free_hdr: + if (jit_data->header) { + bpf_arch_text_copy(&jit_data->ro_header->size, &jit_data->header->size, + sizeof(jit_data->header->size)); + bpf_jit_binary_pack_free(jit_data->ro_header, jit_data->header); + } + goto out_offset; } u64 bpf_jit_alloc_exec_limit(void) @@ -204,3 +230,51 @@ void bpf_jit_free_exec(void *addr) { return vfree(addr); } + +void *bpf_arch_text_copy(void *dst, void *src, size_t len) +{ + int ret; + + mutex_lock(&text_mutex); + ret = patch_text_nosync(dst, src, len); + mutex_unlock(&text_mutex); + + if (ret) + return ERR_PTR(-EINVAL); + + return dst; +} + +int bpf_arch_text_invalidate(void *dst, size_t len) +{ + int ret; + + mutex_lock(&text_mutex); + ret = patch_text_set_nosync(dst, 0, len); + mutex_unlock(&text_mutex); + + return ret; +} + +void bpf_jit_free(struct bpf_prog *prog) +{ + if (prog->jited) { + struct rv_jit_data *jit_data = prog->aux->jit_data; + struct bpf_binary_header *hdr; + + /* + * If we fail the final pass of JIT (from jit_subprogs), + * the program may not be finalized yet. Call finalize here + * before freeing it. + */ + if (jit_data) { + bpf_jit_binary_pack_finalize(prog, jit_data->ro_header, jit_data->header); + kfree(jit_data); + } + hdr = bpf_jit_binary_pack_hdr(prog); + bpf_jit_binary_pack_free(hdr, NULL); + WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(prog)); + } + + bpf_prog_unlock_free(prog); +} -- cgit v1.2.3 From e7ddd00eb3752b109e24eee19cb6b7421059d4db Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Fri, 1 Sep 2023 11:51:11 +0100 Subject: riscv: Kconfig: Select DMA_DIRECT_REMAP only if MMU is enabled kernel/dma/mapping.c has its use of pgprot_dmacoherent() inside an #ifdef CONFIG_MMU block. kernel/dma/pool.c has its use of pgprot_dmacoherent() inside an #ifdef CONFIG_DMA_DIRECT_REMAP block. So select DMA_DIRECT_REMAP only if MMU is enabled for RISCV_DMA_NONCOHERENT config. This avoids users to explicitly select MMU. Suggested-by: Geert Uytterhoeven Signed-off-by: Lad Prabhakar Link: https://lore.kernel.org/r/20230901105111.311200-1-prabhakar.mahadev-lad.rj@bp.renesas.com Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/riscv') diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 712ae0269131..d607ab0f7c6d 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -273,7 +273,7 @@ config RISCV_DMA_NONCOHERENT select ARCH_HAS_SYNC_DMA_FOR_CPU select ARCH_HAS_SYNC_DMA_FOR_DEVICE select DMA_BOUNCE_UNALIGNED_KMALLOC if SWIOTLB - select DMA_DIRECT_REMAP + select DMA_DIRECT_REMAP if MMU config RISCV_NONSTANDARD_CACHE_OPS bool -- cgit v1.2.3 From 54adc24c9a8f6e602f7e9d716107814d10281f8d Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Fri, 1 Sep 2023 11:58:58 +0100 Subject: riscv: Kconfig.errata: Drop dependency for MMU in ERRATA_ANDES_CMO config Now that RISCV_DMA_NONCOHERENT conditionally selects DMA_DIRECT_REMAP ie only if MMU is enabled, we no longer need the MMU dependency in ERRATA_ANDES_CMO config. Signed-off-by: Lad Prabhakar Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20230901105858.311745-1-prabhakar.mahadev-lad.rj@bp.renesas.com Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig.errata | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/riscv') diff --git a/arch/riscv/Kconfig.errata b/arch/riscv/Kconfig.errata index 92c779764b27..bee5d838763b 100644 --- a/arch/riscv/Kconfig.errata +++ b/arch/riscv/Kconfig.errata @@ -12,7 +12,7 @@ config ERRATA_ANDES config ERRATA_ANDES_CMO bool "Apply Andes cache management errata" - depends on ERRATA_ANDES && MMU && ARCH_R9A07G043 + depends on ERRATA_ANDES && ARCH_R9A07G043 select RISCV_DMA_NONCOHERENT default y help -- cgit v1.2.3 From 2f73b35d79d06f5354c859c31e5946391efc45c8 Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Fri, 1 Sep 2023 12:03:20 +0100 Subject: riscv: Kconfig.errata: Add dependency for RISCV_SBI in ERRATA_ANDES config Andes errata uses sbi_ecalll() which is only available if RISCV_SBI is enabled. So add an dependency for RISCV_SBI in ERRATA_ANDES config to avoid any build failures. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202308311610.ec6bm2G8-lkp@intel.com/ Signed-off-by: Lad Prabhakar Reviewed-by: Geert Uytterhoeven Tested-by: Randy Dunlap Link: https://lore.kernel.org/r/20230901110320.312674-1-prabhakar.mahadev-lad.rj@bp.renesas.com Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig.errata | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/riscv') diff --git a/arch/riscv/Kconfig.errata b/arch/riscv/Kconfig.errata index bee5d838763b..566bcefeab50 100644 --- a/arch/riscv/Kconfig.errata +++ b/arch/riscv/Kconfig.errata @@ -2,7 +2,7 @@ menu "CPU errata selection" config ERRATA_ANDES bool "Andes AX45MP errata" - depends on RISCV_ALTERNATIVE + depends on RISCV_ALTERNATIVE && RISCV_SBI help All Andes errata Kconfig depend on this Kconfig. Disabling this Kconfig will disable all Andes errata. Please say "Y" -- cgit v1.2.3