From 23ad288aaf153a78b042e10062be1a6284909f95 Mon Sep 17 00:00:00 2001 From: Qinglin Pan Date: Thu, 9 Feb 2023 21:16:45 +0800 Subject: riscv: mm: modify pte format for Svnapot Add one alternative to enable/disable Svnapot support, enabling it when "svnapot" is present in the "riscv,isa" field of the fdt and the SVNAPOT compile option is set. This influences the behavior of has_svnapot(); all code that depends on Svnapot must first make sure that has_svnapot() returns true. Modify the PTE definition for Svnapot and create some functions in pgtable.h to mark a PTE as napot and to check whether a PTE is a Svnapot PTE. So far, only the 64KB napot size is supported in the spec, so some macros have only a 64KB version. Signed-off-by: Qinglin Pan Reviewed-by: Andrew Jones Link: https://lore.kernel.org/r/20230209131647.17245-2-panqinglin00@gmail.com Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 19 ++++++++++++++++++ arch/riscv/include/asm/hwcap.h | 9 +++++---- arch/riscv/include/asm/page.h | 5 ----- arch/riscv/include/asm/pgtable-64.h | 34 ++++++++++++++++++++++++++++++++ arch/riscv/include/asm/pgtable.h | 39 ++++++++++++++++++++++++++++++++++++- arch/riscv/kernel/cpu.c | 1 + arch/riscv/kernel/cpufeature.c | 1 + 7 files changed, 98 insertions(+), 10 deletions(-) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 7c814fbf9527..abbb7b94488d 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -397,6 +397,25 @@ config RISCV_ISA_C If you don't know what to do here, say Y. +config RISCV_ISA_SVNAPOT + bool "SVNAPOT extension support" + depends on 64BIT && MMU + default y + select RISCV_ALTERNATIVE + help + Allow kernel to detect the SVNAPOT ISA-extension dynamically at boot + time and enable its usage. + + The SVNAPOT extension is used to mark contiguous PTEs as a range + of contiguous virtual-to-physical translations for a naturally + aligned power-of-2 (NAPOT) granularity larger than the base 4KB page + size. When HUGETLBFS is also selected this option unconditionally + allocates some memory for each NAPOT page size supported by the kernel. + When optimizing for low memory consumption and for platforms without + the SVNAPOT extension, it may be better to say N here. + + If you don't know what to do here, say Y.
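(Editorial aside, not part of the patch: a worked sketch of the NAPOT encoding that the header changes below introduce, assuming the 64KB order-4 case and _PAGE_PFN_SHIFT == 10 from pgtable-64.h.)

	/*
	 * pte_mknapot(pte, 4), defined below, computes:
	 *   pos        = 4 - 1 + _PAGE_PFN_SHIFT = 13
	 *   napot_mask = ~GENMASK(13, 10), clearing the low log2(16)
	 *                PFN bits (they are zero anyway for a 64KB-aligned
	 *                base), and napot_bit = BIT(13), so the low PFN
	 *                bits read 0b1000
	 *   _PAGE_NAPOT (bit 63) marks the PTE as a NAPOT translation
	 *
	 * napot_cont_order() recovers the order from the trailing zeros:
	 * ctz((pte >> 10) << 1) = ctz(...10000) = 4, and pte_pfn() strips
	 * the encoding bit with res & (res - 1).
	 */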
+ config RISCV_ISA_SVPBMT bool "SVPBMT extension support" depends on 64BIT && MMU diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h index ee9c80fe0062..6e368d3f6631 100644 --- a/arch/riscv/include/asm/hwcap.h +++ b/arch/riscv/include/asm/hwcap.h @@ -43,10 +43,11 @@ #define RISCV_ISA_EXT_SSCOFPMF 26 #define RISCV_ISA_EXT_SSTC 27 #define RISCV_ISA_EXT_SVINVAL 28 -#define RISCV_ISA_EXT_SVPBMT 29 -#define RISCV_ISA_EXT_ZBB 30 -#define RISCV_ISA_EXT_ZICBOM 31 -#define RISCV_ISA_EXT_ZIHINTPAUSE 32 +#define RISCV_ISA_EXT_SVNAPOT 29 +#define RISCV_ISA_EXT_SVPBMT 30 +#define RISCV_ISA_EXT_ZBB 31 +#define RISCV_ISA_EXT_ZICBOM 32 +#define RISCV_ISA_EXT_ZIHINTPAUSE 33 #ifndef __ASSEMBLY__ diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h index 9f432c1b5289..24a3dd265183 100644 --- a/arch/riscv/include/asm/page.h +++ b/arch/riscv/include/asm/page.h @@ -16,11 +16,6 @@ #define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT) #define PAGE_MASK (~(PAGE_SIZE - 1)) -#ifdef CONFIG_64BIT -#define HUGE_MAX_HSTATE 2 -#else -#define HUGE_MAX_HSTATE 1 -#endif #define HPAGE_SHIFT PMD_SHIFT #define HPAGE_SIZE (_AC(1, UL) << HPAGE_SHIFT) #define HPAGE_MASK (~(HPAGE_SIZE - 1)) diff --git a/arch/riscv/include/asm/pgtable-64.h b/arch/riscv/include/asm/pgtable-64.h index 42a042c0e13e..7a5097202e15 100644 --- a/arch/riscv/include/asm/pgtable-64.h +++ b/arch/riscv/include/asm/pgtable-64.h @@ -78,6 +78,40 @@ typedef struct { */ #define _PAGE_PFN_MASK GENMASK(53, 10) +/* + * [63] Svnapot definitions: + * 0 Svnapot disabled + * 1 Svnapot enabled + */ +#define _PAGE_NAPOT_SHIFT 63 +#define _PAGE_NAPOT BIT(_PAGE_NAPOT_SHIFT) +/* + * Only 64KB (order 4) napot ptes supported. + */ +#define NAPOT_CONT_ORDER_BASE 4 +enum napot_cont_order { + NAPOT_CONT64KB_ORDER = NAPOT_CONT_ORDER_BASE, + NAPOT_ORDER_MAX, +}; + +#define for_each_napot_order(order) \ + for (order = NAPOT_CONT_ORDER_BASE; order < NAPOT_ORDER_MAX; order++) +#define for_each_napot_order_rev(order) \ + for (order = NAPOT_ORDER_MAX - 1; \ + order >= NAPOT_CONT_ORDER_BASE; order--) +#define napot_cont_order(val) (__builtin_ctzl((val.pte >> _PAGE_PFN_SHIFT) << 1)) + +#define napot_cont_shift(order) ((order) + PAGE_SHIFT) +#define napot_cont_size(order) BIT(napot_cont_shift(order)) +#define napot_cont_mask(order) (~(napot_cont_size(order) - 1UL)) +#define napot_pte_num(order) BIT(order) + +#ifdef CONFIG_RISCV_ISA_SVNAPOT +#define HUGE_MAX_HSTATE (2 + (NAPOT_ORDER_MAX - NAPOT_CONT_ORDER_BASE)) +#else +#define HUGE_MAX_HSTATE 2 +#endif + /* * [62:61] Svpbmt Memory Type definitions: * diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 2a88362dffa5..76502bc7bef2 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -264,10 +264,47 @@ static inline pte_t pud_pte(pud_t pud) return __pte(pud_val(pud)); } +#ifdef CONFIG_RISCV_ISA_SVNAPOT + +static __always_inline bool has_svnapot(void) +{ + return riscv_has_extension_likely(RISCV_ISA_EXT_SVNAPOT); +} + +static inline unsigned long pte_napot(pte_t pte) +{ + return pte_val(pte) & _PAGE_NAPOT; +} + +static inline pte_t pte_mknapot(pte_t pte, unsigned int order) +{ + int pos = order - 1 + _PAGE_PFN_SHIFT; + unsigned long napot_bit = BIT(pos); + unsigned long napot_mask = ~GENMASK(pos, _PAGE_PFN_SHIFT); + + return __pte((pte_val(pte) & napot_mask) | napot_bit | _PAGE_NAPOT); +} + +#else + +static __always_inline bool has_svnapot(void) { return false; } + +static inline unsigned long pte_napot(pte_t pte) +{ + return 0; +} + 
+#endif /* CONFIG_RISCV_ISA_SVNAPOT */ + /* Yields the page frame number (PFN) of a page table entry */ static inline unsigned long pte_pfn(pte_t pte) { - return __page_val_to_pfn(pte_val(pte)); + unsigned long res = __page_val_to_pfn(pte_val(pte)); + + if (has_svnapot() && pte_napot(pte)) + res = res & (res - 1UL); + + return res; } #define pte_page(x) pfn_to_page(pte_pfn(x)) diff --git a/arch/riscv/kernel/cpu.c b/arch/riscv/kernel/cpu.c index 420228e219f7..5670909619c8 100644 --- a/arch/riscv/kernel/cpu.c +++ b/arch/riscv/kernel/cpu.c @@ -191,6 +191,7 @@ static struct riscv_isa_ext_data isa_ext_arr[] = { __RISCV_ISA_EXT_DATA(sscofpmf, RISCV_ISA_EXT_SSCOFPMF), __RISCV_ISA_EXT_DATA(sstc, RISCV_ISA_EXT_SSTC), __RISCV_ISA_EXT_DATA(svinval, RISCV_ISA_EXT_SVINVAL), + __RISCV_ISA_EXT_DATA(svnapot, RISCV_ISA_EXT_SVNAPOT), __RISCV_ISA_EXT_DATA(svpbmt, RISCV_ISA_EXT_SVPBMT), __RISCV_ISA_EXT_DATA("", RISCV_ISA_EXT_MAX), }; diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c index 21fb567e1b22..271e391d436d 100644 --- a/arch/riscv/kernel/cpufeature.c +++ b/arch/riscv/kernel/cpufeature.c @@ -223,6 +223,7 @@ void __init riscv_fill_hwcap(void) SET_ISA_EXT_MAP("sscofpmf", RISCV_ISA_EXT_SSCOFPMF); SET_ISA_EXT_MAP("sstc", RISCV_ISA_EXT_SSTC); SET_ISA_EXT_MAP("svinval", RISCV_ISA_EXT_SVINVAL); + SET_ISA_EXT_MAP("svnapot", RISCV_ISA_EXT_SVNAPOT); SET_ISA_EXT_MAP("svpbmt", RISCV_ISA_EXT_SVPBMT); SET_ISA_EXT_MAP("zbb", RISCV_ISA_EXT_ZBB); SET_ISA_EXT_MAP("zicbom", RISCV_ISA_EXT_ZICBOM); -- cgit v1.2.3 From 82a1a1f3bfb628c4a44652349b94c66c7e8be7c9 Mon Sep 17 00:00:00 2001 From: Qinglin Pan Date: Thu, 9 Feb 2023 21:16:46 +0800 Subject: riscv: mm: support Svnapot in hugetlb page Svnapot can be used to support 64KB hugetlb pages, so it becomes a new size option when using hugetlbfs. Add a basic hugetlb page implementation and use Svnapot to support 64KB as one of its sizes. To test, boot the kernel with a command line containing "default_hugepagesz=64K hugepagesz=64K hugepages=20" and run a simple test like this: tools/testing/selftests/vm/map_hugetlb 1 16 It should pass.
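For reference, a minimal standalone sketch of what that selftest exercises (a hypothetical test program, not part of this series; MAP_HUGE_SHIFT comes from the uapi mman header):

	#define _GNU_SOURCE
	#include <string.h>
	#include <sys/mman.h>

	#ifndef MAP_HUGE_SHIFT
	#define MAP_HUGE_SHIFT 26	/* from uapi <linux/mman.h> */
	#endif

	int main(void)
	{
		size_t len = 1UL << 20;	/* 1 MiB, matching "map_hugetlb 1 16" */
		/* 16 == log2(64K): explicitly request 64KB huge pages */
		int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB |
			    (16 << MAP_HUGE_SHIFT);
		void *p = mmap(NULL, len, PROT_READ | PROT_WRITE, flags, -1, 0);

		if (p == MAP_FAILED)
			return 1;
		memset(p, 0x5a, len);	/* fault in the napot-mapped pages */
		return munmap(p, len);
	}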
Signed-off-by: Qinglin Pan Reviewed-by: Andrew Jones Link: https://lore.kernel.org/r/20230209131647.17245-3-panqinglin00@gmail.com Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 2 +- arch/riscv/include/asm/hugetlb.h | 34 ++++- arch/riscv/mm/hugetlbpage.c | 301 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 335 insertions(+), 2 deletions(-) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index abbb7b94488d..a81a89131bb9 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -44,7 +44,7 @@ config RISCV select ARCH_USE_QUEUED_RWLOCKS select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU select ARCH_WANT_FRAME_POINTERS - select ARCH_WANT_GENERAL_HUGETLB + select ARCH_WANT_GENERAL_HUGETLB if !RISCV_ISA_SVNAPOT select ARCH_WANT_HUGE_PMD_SHARE if 64BIT select ARCH_WANTS_THP_SWAP if HAVE_ARCH_TRANSPARENT_HUGEPAGE select BINFMT_FLAT_NO_DATA_START_OFFSET if !MMU diff --git a/arch/riscv/include/asm/hugetlb.h b/arch/riscv/include/asm/hugetlb.h index ec19d6afc896..fe6f23006641 100644 --- a/arch/riscv/include/asm/hugetlb.h +++ b/arch/riscv/include/asm/hugetlb.h @@ -2,7 +2,6 @@ #ifndef _ASM_RISCV_HUGETLB_H #define _ASM_RISCV_HUGETLB_H -#include #include static inline void arch_clear_hugepage_flags(struct page *page) @@ -11,4 +10,37 @@ static inline void arch_clear_hugepage_flags(struct page *page) } #define arch_clear_hugepage_flags arch_clear_hugepage_flags +#ifdef CONFIG_RISCV_ISA_SVNAPOT +#define __HAVE_ARCH_HUGE_PTE_CLEAR +void huge_pte_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned long sz); + +#define __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT +void set_huge_pte_at(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, pte_t pte); + +#define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR +pte_t huge_ptep_get_and_clear(struct mm_struct *mm, + unsigned long addr, pte_t *ptep); + +#define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH +pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep); + +#define __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT +void huge_ptep_set_wrprotect(struct mm_struct *mm, + unsigned long addr, pte_t *ptep); + +#define __HAVE_ARCH_HUGE_PTEP_SET_ACCESS_FLAGS +int huge_ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, + pte_t pte, int dirty); + +pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags); +#define arch_make_huge_pte arch_make_huge_pte + +#endif /*CONFIG_RISCV_ISA_SVNAPOT*/ + +#include + #endif /* _ASM_RISCV_HUGETLB_H */ diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c index 932dadfdca54..a163a3e0f0d4 100644 --- a/arch/riscv/mm/hugetlbpage.c +++ b/arch/riscv/mm/hugetlbpage.c @@ -2,6 +2,305 @@ #include #include +#ifdef CONFIG_RISCV_ISA_SVNAPOT +pte_t *huge_pte_alloc(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long addr, + unsigned long sz) +{ + unsigned long order; + pte_t *pte = NULL; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + + pgd = pgd_offset(mm, addr); + p4d = p4d_alloc(mm, pgd, addr); + if (!p4d) + return NULL; + + pud = pud_alloc(mm, p4d, addr); + if (!pud) + return NULL; + + if (sz == PUD_SIZE) { + pte = (pte_t *)pud; + goto out; + } + + if (sz == PMD_SIZE) { + if (want_pmd_share(vma, addr) && pud_none(*pud)) + pte = huge_pmd_share(mm, vma, addr, pud); + else + pte = (pte_t *)pmd_alloc(mm, pud, addr); + goto out; + } + + pmd = pmd_alloc(mm, pud, addr); + if (!pmd) + return NULL; + + for_each_napot_order(order) { + if (napot_cont_size(order) == sz) { + pte = pte_alloc_map(mm, pmd, addr & 
napot_cont_mask(order)); + break; + } + } + +out: + WARN_ON_ONCE(pte && pte_present(*pte) && !pte_huge(*pte)); + return pte; +} + +pte_t *huge_pte_offset(struct mm_struct *mm, + unsigned long addr, + unsigned long sz) +{ + unsigned long order; + pte_t *pte = NULL; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + + pgd = pgd_offset(mm, addr); + if (!pgd_present(*pgd)) + return NULL; + + p4d = p4d_offset(pgd, addr); + if (!p4d_present(*p4d)) + return NULL; + + pud = pud_offset(p4d, addr); + if (sz == PUD_SIZE) + /* must be pud huge, non-present or none */ + return (pte_t *)pud; + + if (!pud_present(*pud)) + return NULL; + + pmd = pmd_offset(pud, addr); + if (sz == PMD_SIZE) + /* must be pmd huge, non-present or none */ + return (pte_t *)pmd; + + if (!pmd_present(*pmd)) + return NULL; + + for_each_napot_order(order) { + if (napot_cont_size(order) == sz) { + pte = pte_offset_kernel(pmd, addr & napot_cont_mask(order)); + break; + } + } + return pte; +} + +static pte_t get_clear_contig(struct mm_struct *mm, + unsigned long addr, + pte_t *ptep, + unsigned long pte_num) +{ + pte_t orig_pte = ptep_get(ptep); + unsigned long i; + + for (i = 0; i < pte_num; i++, addr += PAGE_SIZE, ptep++) { + pte_t pte = ptep_get_and_clear(mm, addr, ptep); + + if (pte_dirty(pte)) + orig_pte = pte_mkdirty(orig_pte); + + if (pte_young(pte)) + orig_pte = pte_mkyoung(orig_pte); + } + + return orig_pte; +} + +static pte_t get_clear_contig_flush(struct mm_struct *mm, + unsigned long addr, + pte_t *ptep, + unsigned long pte_num) +{ + pte_t orig_pte = get_clear_contig(mm, addr, ptep, pte_num); + struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0); + bool valid = !pte_none(orig_pte); + + if (valid) + flush_tlb_range(&vma, addr, addr + (PAGE_SIZE * pte_num)); + + return orig_pte; +} + +pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags) +{ + unsigned long order; + + for_each_napot_order(order) { + if (shift == napot_cont_shift(order)) { + entry = pte_mknapot(entry, order); + break; + } + } + if (order == NAPOT_ORDER_MAX) + entry = pte_mkhuge(entry); + + return entry; +} + +void set_huge_pte_at(struct mm_struct *mm, + unsigned long addr, + pte_t *ptep, + pte_t pte) +{ + int i, pte_num; + + if (!pte_napot(pte)) { + set_pte_at(mm, addr, ptep, pte); + return; + } + + pte_num = napot_pte_num(napot_cont_order(pte)); + for (i = 0; i < pte_num; i++, ptep++, addr += PAGE_SIZE) + set_pte_at(mm, addr, ptep, pte); +} + +int huge_ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long addr, + pte_t *ptep, + pte_t pte, + int dirty) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long order; + pte_t orig_pte; + int i, pte_num; + + if (!pte_napot(pte)) + return ptep_set_access_flags(vma, addr, ptep, pte, dirty); + + order = napot_cont_order(pte); + pte_num = napot_pte_num(order); + ptep = huge_pte_offset(mm, addr, napot_cont_size(order)); + orig_pte = get_clear_contig_flush(mm, addr, ptep, pte_num); + + if (pte_dirty(orig_pte)) + pte = pte_mkdirty(pte); + + if (pte_young(orig_pte)) + pte = pte_mkyoung(pte); + + for (i = 0; i < pte_num; i++, addr += PAGE_SIZE, ptep++) + set_pte_at(mm, addr, ptep, pte); + + return true; +} + +pte_t huge_ptep_get_and_clear(struct mm_struct *mm, + unsigned long addr, + pte_t *ptep) +{ + pte_t orig_pte = ptep_get(ptep); + int pte_num; + + if (!pte_napot(orig_pte)) + return ptep_get_and_clear(mm, addr, ptep); + + pte_num = napot_pte_num(napot_cont_order(orig_pte)); + + return get_clear_contig(mm, addr, ptep, pte_num); +} + +void huge_ptep_set_wrprotect(struct mm_struct 
*mm, + unsigned long addr, + pte_t *ptep) +{ + pte_t pte = ptep_get(ptep); + unsigned long order; + int i, pte_num; + + if (!pte_napot(pte)) { + ptep_set_wrprotect(mm, addr, ptep); + return; + } + + order = napot_cont_order(pte); + pte_num = napot_pte_num(order); + ptep = huge_pte_offset(mm, addr, napot_cont_size(order)); + + for (i = 0; i < pte_num; i++, addr += PAGE_SIZE, ptep++) + ptep_set_wrprotect(mm, addr, ptep); +} + +pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, + unsigned long addr, + pte_t *ptep) +{ + pte_t pte = ptep_get(ptep); + int pte_num; + + if (!pte_napot(pte)) + return ptep_clear_flush(vma, addr, ptep); + + pte_num = napot_pte_num(napot_cont_order(pte)); + + return get_clear_contig_flush(vma->vm_mm, addr, ptep, pte_num); +} + +void huge_pte_clear(struct mm_struct *mm, + unsigned long addr, + pte_t *ptep, + unsigned long sz) +{ + pte_t pte = READ_ONCE(*ptep); + int i, pte_num; + + if (!pte_napot(pte)) { + pte_clear(mm, addr, ptep); + return; + } + + pte_num = napot_pte_num(napot_cont_order(pte)); + for (i = 0; i < pte_num; i++, addr += PAGE_SIZE, ptep++) + pte_clear(mm, addr, ptep); +} + +static __init bool is_napot_size(unsigned long size) +{ + unsigned long order; + + if (!has_svnapot()) + return false; + + for_each_napot_order(order) { + if (size == napot_cont_size(order)) + return true; + } + return false; +} + +static __init int napot_hugetlbpages_init(void) +{ + if (has_svnapot()) { + unsigned long order; + + for_each_napot_order(order) + hugetlb_add_hstate(order); + } + return 0; +} +arch_initcall(napot_hugetlbpages_init); + +#else + +static __init bool is_napot_size(unsigned long size) +{ + return false; +} + +#endif /*CONFIG_RISCV_ISA_SVNAPOT*/ + int pud_huge(pud_t pud) { return pud_leaf(pud); @@ -18,6 +317,8 @@ bool __init arch_hugetlb_valid_size(unsigned long size) return true; else if (IS_ENABLED(CONFIG_64BIT) && size == PUD_SIZE) return true; + else if (is_napot_size(size)) + return true; else return false; } -- cgit v1.2.3 From ce173474cf19fe7fbe8f0fc74e3c81ec9c3d9807 Mon Sep 17 00:00:00 2001 From: Qinglin Pan Date: Thu, 9 Feb 2023 21:16:47 +0800 Subject: riscv: mm: support Svnapot in huge vmap As HAVE_ARCH_HUGE_VMAP and HAVE_ARCH_HUGE_VMALLOC are supported, we can implement arch_vmap_pte_range_map_size() and arch_vmap_pte_supported_shift() for Svnapot to support huge vmaps at napot sizes. Huge vmap can be exercised with the huge mappings used by PCI drivers. Huge vmalloc with Svnapot can be tested with test_vmalloc once [1] is applied: probe the module to run fix_size_alloc_test with use_huge true.
[1]https://lore.kernel.org/all/20221212055657.698420-1-panqinglin2020@iscas.ac.cn/ Signed-off-by: Qinglin Pan Reviewed-by: Andrew Jones Acked-by: Conor Dooley Link: https://lore.kernel.org/r/20230209131647.17245-4-panqinglin00@gmail.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/vmalloc.h | 61 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 60 insertions(+), 1 deletion(-) diff --git a/arch/riscv/include/asm/vmalloc.h b/arch/riscv/include/asm/vmalloc.h index 48da5371f1e9..58d3e447f191 100644 --- a/arch/riscv/include/asm/vmalloc.h +++ b/arch/riscv/include/asm/vmalloc.h @@ -17,6 +17,65 @@ static inline bool arch_vmap_pmd_supported(pgprot_t prot) return true; } -#endif +#ifdef CONFIG_RISCV_ISA_SVNAPOT +#include +#define arch_vmap_pte_range_map_size arch_vmap_pte_range_map_size +static inline unsigned long arch_vmap_pte_range_map_size(unsigned long addr, unsigned long end, + u64 pfn, unsigned int max_page_shift) +{ + unsigned long map_size = PAGE_SIZE; + unsigned long size, order; + + if (!has_svnapot()) + return map_size; + + for_each_napot_order_rev(order) { + if (napot_cont_shift(order) > max_page_shift) + continue; + + size = napot_cont_size(order); + if (end - addr < size) + continue; + + if (!IS_ALIGNED(addr, size)) + continue; + + if (!IS_ALIGNED(PFN_PHYS(pfn), size)) + continue; + + map_size = size; + break; + } + + return map_size; +} + +#define arch_vmap_pte_supported_shift arch_vmap_pte_supported_shift +static inline int arch_vmap_pte_supported_shift(unsigned long size) +{ + int shift = PAGE_SHIFT; + unsigned long order; + + if (!has_svnapot()) + return shift; + + WARN_ON_ONCE(size >= PMD_SIZE); + + for_each_napot_order_rev(order) { + if (napot_cont_size(order) > size) + continue; + + if (!IS_ALIGNED(size, napot_cont_size(order))) + continue; + + shift = napot_cont_shift(order); + break; + } + + return shift; +} + +#endif /* CONFIG_RISCV_ISA_SVNAPOT */ +#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ #endif /* _ASM_RISCV_VMALLOC_H */ -- cgit v1.2.3 From 099122af4e29efb41e0db6cc2cd5a07278250e4f Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Fri, 24 Feb 2023 16:45:56 +0100 Subject: riscv: Clarify RISCV_ALTERNATIVE help text Clarify RISCV_ALTERNATIVE's help text by pointing out that code patching is not only done at boot time, but also module load time. Also point out that this is the minimal possible overhead. Signed-off-by: Andrew Jones Link: https://lore.kernel.org/r/20230224154601.88163-2-ajones@ventanamicro.com Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index c5e42cc37604..04898c50ef1d 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -379,8 +379,8 @@ config RISCV_ALTERNATIVE help This Kconfig allows the kernel to automatically patch the errata required by the execution platform at run time. The - code patching is performed once in the boot stages. It means - that the overhead from this mechanism is just taken once. + code patching overhead is minimal, as it's only done once + at boot and once on each module load. config RISCV_ALTERNATIVE_EARLY bool -- cgit v1.2.3 From a3d095ac00faee0deb575ba053f0567b0d124d0b Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Fri, 24 Feb 2023 16:45:57 +0100 Subject: riscv: Rename Kconfig.erratas to Kconfig.errata Errata is already plural for erratum. Rename it to make the grammar gooder. 
Signed-off-by: Andrew Jones Reviewed-by: Conor Dooley Reviewed-by: Heiko Stuebner Link: https://lore.kernel.org/r/20230224154601.88163-3-ajones@ventanamicro.com Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 2 +- arch/riscv/Kconfig.errata | 82 ++++++++++++++++++++++++++++++++++++++++++++++ arch/riscv/Kconfig.erratas | 82 ---------------------------------------------- 3 files changed, 83 insertions(+), 83 deletions(-) create mode 100644 arch/riscv/Kconfig.errata delete mode 100644 arch/riscv/Kconfig.erratas diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 04898c50ef1d..ab78f677f187 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -245,7 +245,7 @@ config AS_HAS_INSN def_bool $(as-instr,.insn r 51$(comma) 0$(comma) 0$(comma) t0$(comma) t0$(comma) zero) source "arch/riscv/Kconfig.socs" -source "arch/riscv/Kconfig.erratas" +source "arch/riscv/Kconfig.errata" menu "Platform type" diff --git a/arch/riscv/Kconfig.errata b/arch/riscv/Kconfig.errata new file mode 100644 index 000000000000..69621ae6d647 --- /dev/null +++ b/arch/riscv/Kconfig.errata @@ -0,0 +1,82 @@ +menu "CPU errata selection" + +config ERRATA_SIFIVE + bool "SiFive errata" + depends on !XIP_KERNEL + select RISCV_ALTERNATIVE + help + All SiFive errata Kconfig depend on this Kconfig. Disabling + this Kconfig will disable all SiFive errata. Please say "Y" + here if your platform uses SiFive CPU cores. + + Otherwise, please say "N" here to avoid unnecessary overhead. + +config ERRATA_SIFIVE_CIP_453 + bool "Apply SiFive errata CIP-453" + depends on ERRATA_SIFIVE && 64BIT + default y + help + This will apply the SiFive CIP-453 errata to add sign extension + to the $badaddr when exception type is instruction page fault + and instruction access fault. + + If you don't know what to do here, say "Y". + +config ERRATA_SIFIVE_CIP_1200 + bool "Apply SiFive errata CIP-1200" + depends on ERRATA_SIFIVE && 64BIT + default y + help + This will apply the SiFive CIP-1200 errata to repalce all + "sfence.vma addr" with "sfence.vma" to ensure that the addr + has been flushed from TLB. + + If you don't know what to do here, say "Y". + +config ERRATA_THEAD + bool "T-HEAD errata" + depends on !XIP_KERNEL + select RISCV_ALTERNATIVE + help + All T-HEAD errata Kconfig depend on this Kconfig. Disabling + this Kconfig will disable all T-HEAD errata. Please say "Y" + here if your platform uses T-HEAD CPU cores. + + Otherwise, please say "N" here to avoid unnecessary overhead. + +config ERRATA_THEAD_PBMT + bool "Apply T-Head memory type errata" + depends on ERRATA_THEAD && 64BIT && MMU + select RISCV_ALTERNATIVE_EARLY + default y + help + This will apply the memory type errata to handle the non-standard + memory type bits in page-table-entries on T-Head SoCs. + + If you don't know what to do here, say "Y". + +config ERRATA_THEAD_CMO + bool "Apply T-Head cache management errata" + depends on ERRATA_THEAD && MMU + select RISCV_DMA_NONCOHERENT + default y + help + This will apply the cache management errata to handle the + non-standard handling on non-coherent operations on T-Head SoCs. + + If you don't know what to do here, say "Y". + +config ERRATA_THEAD_PMU + bool "Apply T-Head PMU errata" + depends on ERRATA_THEAD && RISCV_PMU_SBI + default y + help + The T-Head C9xx cores implement a PMU overflow extension very + similar to the core SSCOFPMF extension. + + This will apply the overflow errata to handle the non-standard + behaviour via the regular SBI PMU driver and interface. + + If you don't know what to do here, say "Y". 
+ +endmenu # "CPU errata selection" diff --git a/arch/riscv/Kconfig.erratas b/arch/riscv/Kconfig.erratas deleted file mode 100644 index 69621ae6d647..000000000000 --- a/arch/riscv/Kconfig.erratas +++ /dev/null @@ -1,82 +0,0 @@ -menu "CPU errata selection" - -config ERRATA_SIFIVE - bool "SiFive errata" - depends on !XIP_KERNEL - select RISCV_ALTERNATIVE - help - All SiFive errata Kconfig depend on this Kconfig. Disabling - this Kconfig will disable all SiFive errata. Please say "Y" - here if your platform uses SiFive CPU cores. - - Otherwise, please say "N" here to avoid unnecessary overhead. - -config ERRATA_SIFIVE_CIP_453 - bool "Apply SiFive errata CIP-453" - depends on ERRATA_SIFIVE && 64BIT - default y - help - This will apply the SiFive CIP-453 errata to add sign extension - to the $badaddr when exception type is instruction page fault - and instruction access fault. - - If you don't know what to do here, say "Y". - -config ERRATA_SIFIVE_CIP_1200 - bool "Apply SiFive errata CIP-1200" - depends on ERRATA_SIFIVE && 64BIT - default y - help - This will apply the SiFive CIP-1200 errata to repalce all - "sfence.vma addr" with "sfence.vma" to ensure that the addr - has been flushed from TLB. - - If you don't know what to do here, say "Y". - -config ERRATA_THEAD - bool "T-HEAD errata" - depends on !XIP_KERNEL - select RISCV_ALTERNATIVE - help - All T-HEAD errata Kconfig depend on this Kconfig. Disabling - this Kconfig will disable all T-HEAD errata. Please say "Y" - here if your platform uses T-HEAD CPU cores. - - Otherwise, please say "N" here to avoid unnecessary overhead. - -config ERRATA_THEAD_PBMT - bool "Apply T-Head memory type errata" - depends on ERRATA_THEAD && 64BIT && MMU - select RISCV_ALTERNATIVE_EARLY - default y - help - This will apply the memory type errata to handle the non-standard - memory type bits in page-table-entries on T-Head SoCs. - - If you don't know what to do here, say "Y". - -config ERRATA_THEAD_CMO - bool "Apply T-Head cache management errata" - depends on ERRATA_THEAD && MMU - select RISCV_DMA_NONCOHERENT - default y - help - This will apply the cache management errata to handle the - non-standard handling on non-coherent operations on T-Head SoCs. - - If you don't know what to do here, say "Y". - -config ERRATA_THEAD_PMU - bool "Apply T-Head PMU errata" - depends on ERRATA_THEAD && RISCV_PMU_SBI - default y - help - The T-Head C9xx cores implement a PMU overflow extension very - similar to the core SSCOFPMF extension. - - This will apply the overflow errata to handle the non-standard - behaviour via the regular SBI PMU driver and interface. - - If you don't know what to do here, say "Y". - -endmenu # "CPU errata selection" -- cgit v1.2.3 From ce06b42a4a96d46d4844f2b630af7d349a924a58 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Fri, 24 Feb 2023 16:45:58 +0100 Subject: riscv: alternatives: Remove unnecessary define and unused struct A define and a struct were introduced with commit 6f4eea90465a ("riscv: Introduce alternative mechanism to apply errata solution"), which introduced alternatives to RISC-V. The define is used for an arbitrary string length, specific to sifive errata, so just use the number directly there instead. The struct has never been used, so remove it. 
Signed-off-by: Andrew Jones Reviewed-by: Conor Dooley Reviewed-by: Heiko Stuebner Link: https://lore.kernel.org/r/20230224154601.88163-4-ajones@ventanamicro.com Signed-off-by: Palmer Dabbelt --- arch/riscv/errata/sifive/errata.c | 2 +- arch/riscv/include/asm/alternative.h | 7 ------- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/arch/riscv/errata/sifive/errata.c b/arch/riscv/errata/sifive/errata.c index da55cb247e89..69dfb38e4f06 100644 --- a/arch/riscv/errata/sifive/errata.c +++ b/arch/riscv/errata/sifive/errata.c @@ -14,7 +14,7 @@ #include struct errata_info_t { - char name[ERRATA_STRING_LENGTH_MAX]; + char name[32]; bool (*check_func)(unsigned long arch_id, unsigned long impid); }; diff --git a/arch/riscv/include/asm/alternative.h b/arch/riscv/include/asm/alternative.h index b8648d4f2ac1..3beef400a971 100644 --- a/arch/riscv/include/asm/alternative.h +++ b/arch/riscv/include/asm/alternative.h @@ -6,8 +6,6 @@ #ifndef __ASM_ALTERNATIVE_H #define __ASM_ALTERNATIVE_H -#define ERRATA_STRING_LENGTH_MAX 32 - #include #ifndef __ASSEMBLY__ @@ -43,11 +41,6 @@ struct alt_entry { u32 errata_id; /* The errata id */ }; -struct errata_checkfunc_id { - unsigned long vendor_id; - bool (*func)(struct alt_entry *alt); -}; - void sifive_errata_patch_func(struct alt_entry *begin, struct alt_entry *end, unsigned long archid, unsigned long impid, unsigned int stage); -- cgit v1.2.3 From ff19a8dee196d757dbc32a946843260f0b784ca3 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Fri, 24 Feb 2023 16:45:59 +0100 Subject: riscv: alternatives: Rename errata_id to patch_id Alternatives are used for both errata and cpufeatures. Use a more generic name, 'patch_id', as in "ID of code patching site", to avoid confusion when alternatives are used for cpufeatures. Signed-off-by: Andrew Jones Reviewed-by: Conor Dooley Reviewed-by: Heiko Stuebner Link: https://lore.kernel.org/r/20230224154601.88163-5-ajones@ventanamicro.com Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 6 +-- arch/riscv/errata/sifive/errata.c | 6 +-- arch/riscv/errata/thead/errata.c | 4 +- arch/riscv/include/asm/alternative-macros.h | 72 ++++++++++++++--------------- arch/riscv/include/asm/alternative.h | 4 +- arch/riscv/kernel/cpufeature.c | 6 +-- 6 files changed, 49 insertions(+), 49 deletions(-) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index ab78f677f187..69797e6eed6f 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -378,9 +378,9 @@ config RISCV_ALTERNATIVE depends on !XIP_KERNEL help This Kconfig allows the kernel to automatically patch the - errata required by the execution platform at run time. The - code patching overhead is minimal, as it's only done once - at boot and once on each module load. + erratum or cpufeature required by the execution platform at run + time. The code patching overhead is minimal, as it's only done + once at boot and once on each module load. 
config RISCV_ALTERNATIVE_EARLY bool diff --git a/arch/riscv/errata/sifive/errata.c b/arch/riscv/errata/sifive/errata.c index 69dfb38e4f06..7fa7b8b6a811 100644 --- a/arch/riscv/errata/sifive/errata.c +++ b/arch/riscv/errata/sifive/errata.c @@ -101,12 +101,12 @@ void __init_or_module sifive_errata_patch_func(struct alt_entry *begin, for (alt = begin; alt < end; alt++) { if (alt->vendor_id != SIFIVE_VENDOR_ID) continue; - if (alt->errata_id >= ERRATA_SIFIVE_NUMBER) { - WARN(1, "This errata id:%d is not in kernel errata list", alt->errata_id); + if (alt->patch_id >= ERRATA_SIFIVE_NUMBER) { + WARN(1, "This errata id:%d is not in kernel errata list", alt->patch_id); continue; } - tmp = (1U << alt->errata_id); + tmp = (1U << alt->patch_id); if (cpu_req_errata & tmp) { mutex_lock(&text_mutex); patch_text_nosync(ALT_OLD_PTR(alt), ALT_ALT_PTR(alt), diff --git a/arch/riscv/errata/thead/errata.c b/arch/riscv/errata/thead/errata.c index 3b96a06d3c54..7e8d50ebb71a 100644 --- a/arch/riscv/errata/thead/errata.c +++ b/arch/riscv/errata/thead/errata.c @@ -93,10 +93,10 @@ void __init_or_module thead_errata_patch_func(struct alt_entry *begin, struct al for (alt = begin; alt < end; alt++) { if (alt->vendor_id != THEAD_VENDOR_ID) continue; - if (alt->errata_id >= ERRATA_THEAD_NUMBER) + if (alt->patch_id >= ERRATA_THEAD_NUMBER) continue; - tmp = (1U << alt->errata_id); + tmp = (1U << alt->patch_id); if (cpu_req_errata & tmp) { oldptr = ALT_OLD_PTR(alt); altptr = ALT_ALT_PTR(alt); diff --git a/arch/riscv/include/asm/alternative-macros.h b/arch/riscv/include/asm/alternative-macros.h index 51c6867e02f3..993a44a8fdac 100644 --- a/arch/riscv/include/asm/alternative-macros.h +++ b/arch/riscv/include/asm/alternative-macros.h @@ -6,18 +6,18 @@ #ifdef __ASSEMBLY__ -.macro ALT_ENTRY oldptr newptr vendor_id errata_id new_len +.macro ALT_ENTRY oldptr newptr vendor_id patch_id new_len .4byte \oldptr - . .4byte \newptr - . .2byte \vendor_id .2byte \new_len - .4byte \errata_id + .4byte \patch_id .endm -.macro ALT_NEW_CONTENT vendor_id, errata_id, enable = 1, new_c : vararg +.macro ALT_NEW_CONTENT vendor_id, patch_id, enable = 1, new_c : vararg .if \enable .pushsection .alternative, "a" - ALT_ENTRY 886b, 888f, \vendor_id, \errata_id, 889f - 888f + ALT_ENTRY 886b, 888f, \vendor_id, \patch_id, 889f - 888f .popsection .subsection 1 888 : @@ -33,7 +33,7 @@ .endif .endm -.macro ALTERNATIVE_CFG old_c, new_c, vendor_id, errata_id, enable +.macro ALTERNATIVE_CFG old_c, new_c, vendor_id, patch_id, enable 886 : .option push .option norvc @@ -41,13 +41,13 @@ \old_c .option pop 887 : - ALT_NEW_CONTENT \vendor_id, \errata_id, \enable, \new_c + ALT_NEW_CONTENT \vendor_id, \patch_id, \enable, \new_c .endm -.macro ALTERNATIVE_CFG_2 old_c, new_c_1, vendor_id_1, errata_id_1, enable_1, \ - new_c_2, vendor_id_2, errata_id_2, enable_2 - ALTERNATIVE_CFG "\old_c", "\new_c_1", \vendor_id_1, \errata_id_1, \enable_1 - ALT_NEW_CONTENT \vendor_id_2, \errata_id_2, \enable_2, \new_c_2 +.macro ALTERNATIVE_CFG_2 old_c, new_c_1, vendor_id_1, patch_id_1, enable_1, \ + new_c_2, vendor_id_2, patch_id_2, enable_2 + ALTERNATIVE_CFG "\old_c", "\new_c_1", \vendor_id_1, \patch_id_1, \enable_1 + ALT_NEW_CONTENT \vendor_id_2, \patch_id_2, \enable_2, \new_c_2 .endm #define __ALTERNATIVE_CFG(...) ALTERNATIVE_CFG __VA_ARGS__ @@ -58,17 +58,17 @@ #include #include -#define ALT_ENTRY(oldptr, newptr, vendor_id, errata_id, newlen) \ +#define ALT_ENTRY(oldptr, newptr, vendor_id, patch_id, newlen) \ ".4byte ((" oldptr ") - .) \n" \ ".4byte ((" newptr ") - .) 
\n" \ ".2byte " vendor_id "\n" \ ".2byte " newlen "\n" \ - ".4byte " errata_id "\n" + ".4byte " patch_id "\n" -#define ALT_NEW_CONTENT(vendor_id, errata_id, enable, new_c) \ +#define ALT_NEW_CONTENT(vendor_id, patch_id, enable, new_c) \ ".if " __stringify(enable) " == 1\n" \ ".pushsection .alternative, \"a\"\n" \ - ALT_ENTRY("886b", "888f", __stringify(vendor_id), __stringify(errata_id), "889f - 888f") \ + ALT_ENTRY("886b", "888f", __stringify(vendor_id), __stringify(patch_id), "889f - 888f") \ ".popsection\n" \ ".subsection 1\n" \ "888 :\n" \ @@ -83,7 +83,7 @@ ".previous\n" \ ".endif\n" -#define __ALTERNATIVE_CFG(old_c, new_c, vendor_id, errata_id, enable) \ +#define __ALTERNATIVE_CFG(old_c, new_c, vendor_id, patch_id, enable) \ "886 :\n" \ ".option push\n" \ ".option norvc\n" \ @@ -91,22 +91,22 @@ old_c "\n" \ ".option pop\n" \ "887 :\n" \ - ALT_NEW_CONTENT(vendor_id, errata_id, enable, new_c) + ALT_NEW_CONTENT(vendor_id, patch_id, enable, new_c) -#define __ALTERNATIVE_CFG_2(old_c, new_c_1, vendor_id_1, errata_id_1, enable_1, \ - new_c_2, vendor_id_2, errata_id_2, enable_2) \ - __ALTERNATIVE_CFG(old_c, new_c_1, vendor_id_1, errata_id_1, enable_1) \ - ALT_NEW_CONTENT(vendor_id_2, errata_id_2, enable_2, new_c_2) +#define __ALTERNATIVE_CFG_2(old_c, new_c_1, vendor_id_1, patch_id_1, enable_1, \ + new_c_2, vendor_id_2, patch_id_2, enable_2) \ + __ALTERNATIVE_CFG(old_c, new_c_1, vendor_id_1, patch_id_1, enable_1) \ + ALT_NEW_CONTENT(vendor_id_2, patch_id_2, enable_2, new_c_2) #endif /* __ASSEMBLY__ */ -#define _ALTERNATIVE_CFG(old_c, new_c, vendor_id, errata_id, CONFIG_k) \ - __ALTERNATIVE_CFG(old_c, new_c, vendor_id, errata_id, IS_ENABLED(CONFIG_k)) +#define _ALTERNATIVE_CFG(old_c, new_c, vendor_id, patch_id, CONFIG_k) \ + __ALTERNATIVE_CFG(old_c, new_c, vendor_id, patch_id, IS_ENABLED(CONFIG_k)) -#define _ALTERNATIVE_CFG_2(old_c, new_c_1, vendor_id_1, errata_id_1, CONFIG_k_1, \ - new_c_2, vendor_id_2, errata_id_2, CONFIG_k_2) \ - __ALTERNATIVE_CFG_2(old_c, new_c_1, vendor_id_1, errata_id_1, IS_ENABLED(CONFIG_k_1), \ - new_c_2, vendor_id_2, errata_id_2, IS_ENABLED(CONFIG_k_2)) +#define _ALTERNATIVE_CFG_2(old_c, new_c_1, vendor_id_1, patch_id_1, CONFIG_k_1, \ + new_c_2, vendor_id_2, patch_id_2, CONFIG_k_2) \ + __ALTERNATIVE_CFG_2(old_c, new_c_1, vendor_id_1, patch_id_1, IS_ENABLED(CONFIG_k_1), \ + new_c_2, vendor_id_2, patch_id_2, IS_ENABLED(CONFIG_k_2)) #else /* CONFIG_RISCV_ALTERNATIVE */ #ifdef __ASSEMBLY__ @@ -137,19 +137,19 @@ /* * Usage: - * ALTERNATIVE(old_content, new_content, vendor_id, errata_id, CONFIG_k) + * ALTERNATIVE(old_content, new_content, vendor_id, patch_id, CONFIG_k) * in the assembly code. Otherwise, - * asm(ALTERNATIVE(old_content, new_content, vendor_id, errata_id, CONFIG_k)); + * asm(ALTERNATIVE(old_content, new_content, vendor_id, patch_id, CONFIG_k)); * * old_content: The old content which is probably replaced with new content. * new_content: The new content. * vendor_id: The CPU vendor ID. - * errata_id: The errata ID. - * CONFIG_k: The Kconfig of this errata. When Kconfig is disabled, the old + * patch_id: The patch ID (erratum ID or cpufeature ID). + * CONFIG_k: The Kconfig of this patch ID. When Kconfig is disabled, the old * content will alwyas be executed. 
*/ -#define ALTERNATIVE(old_content, new_content, vendor_id, errata_id, CONFIG_k) \ - _ALTERNATIVE_CFG(old_content, new_content, vendor_id, errata_id, CONFIG_k) +#define ALTERNATIVE(old_content, new_content, vendor_id, patch_id, CONFIG_k) \ + _ALTERNATIVE_CFG(old_content, new_content, vendor_id, patch_id, CONFIG_k) /* * A vendor wants to replace an old_content, but another vendor has used @@ -158,9 +158,9 @@ * on the following sample code and then replace ALTERNATIVE() with * ALTERNATIVE_2() to append its customized content. */ -#define ALTERNATIVE_2(old_content, new_content_1, vendor_id_1, errata_id_1, CONFIG_k_1, \ - new_content_2, vendor_id_2, errata_id_2, CONFIG_k_2) \ - _ALTERNATIVE_CFG_2(old_content, new_content_1, vendor_id_1, errata_id_1, CONFIG_k_1, \ - new_content_2, vendor_id_2, errata_id_2, CONFIG_k_2) +#define ALTERNATIVE_2(old_content, new_content_1, vendor_id_1, patch_id_1, CONFIG_k_1, \ + new_content_2, vendor_id_2, patch_id_2, CONFIG_k_2) \ + _ALTERNATIVE_CFG_2(old_content, new_content_1, vendor_id_1, patch_id_1, CONFIG_k_1, \ + new_content_2, vendor_id_2, patch_id_2, CONFIG_k_2) #endif diff --git a/arch/riscv/include/asm/alternative.h b/arch/riscv/include/asm/alternative.h index 3beef400a971..c8dea9e94310 100644 --- a/arch/riscv/include/asm/alternative.h +++ b/arch/riscv/include/asm/alternative.h @@ -36,9 +36,9 @@ void riscv_alternative_fix_offsets(void *alt_ptr, unsigned int len, struct alt_entry { s32 old_offset; /* offset relative to original instruction or data */ s32 alt_offset; /* offset relative to replacement instruction or data */ - u16 vendor_id; /* cpu vendor id */ + u16 vendor_id; /* CPU vendor ID */ u16 alt_len; /* The replacement size */ - u32 errata_id; /* The errata id */ + u32 patch_id; /* The patch ID (erratum ID or cpufeature ID) */ }; void sifive_errata_patch_func(struct alt_entry *begin, struct alt_entry *end, diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c index 59d58ee0f68d..059db20b28ca 100644 --- a/arch/riscv/kernel/cpufeature.c +++ b/arch/riscv/kernel/cpufeature.c @@ -282,13 +282,13 @@ void __init_or_module riscv_cpufeature_patch_func(struct alt_entry *begin, for (alt = begin; alt < end; alt++) { if (alt->vendor_id != 0) continue; - if (alt->errata_id >= RISCV_ISA_EXT_MAX) { + if (alt->patch_id >= RISCV_ISA_EXT_MAX) { WARN(1, "This extension id:%d is not in ISA extension list", - alt->errata_id); + alt->patch_id); continue; } - if (!__riscv_isa_extension_available(NULL, alt->errata_id)) + if (!__riscv_isa_extension_available(NULL, alt->patch_id)) continue; oldptr = ALT_OLD_PTR(alt); -- cgit v1.2.3 From 28ea374da1d94dfe1253b83aa2d16b5a97fb7c4e Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Fri, 24 Feb 2023 16:46:00 +0100 Subject: riscv: lib: Include hwcap.h directly When using alternatives for cpufeatures we should include hwcap.h directly, rather than through errata_list.h. Opportunistically drop an unused include too. 
Signed-off-by: Andrew Jones Reviewed-by: Conor Dooley Reviewed-by: Heiko Stuebner Link: https://lore.kernel.org/r/20230224154601.88163-6-ajones@ventanamicro.com Signed-off-by: Palmer Dabbelt --- arch/riscv/lib/strcmp.S | 3 +-- arch/riscv/lib/strlen.S | 3 +-- arch/riscv/lib/strncmp.S | 3 +-- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/arch/riscv/lib/strcmp.S b/arch/riscv/lib/strcmp.S index c42a8412547f..687b2bea5c43 100644 --- a/arch/riscv/lib/strcmp.S +++ b/arch/riscv/lib/strcmp.S @@ -2,9 +2,8 @@ #include #include -#include #include -#include +#include /* int strcmp(const char *cs, const char *ct) */ SYM_FUNC_START(strcmp) diff --git a/arch/riscv/lib/strlen.S b/arch/riscv/lib/strlen.S index 15bb8f3aa959..db3d42d99b78 100644 --- a/arch/riscv/lib/strlen.S +++ b/arch/riscv/lib/strlen.S @@ -2,9 +2,8 @@ #include #include -#include #include -#include +#include /* int strlen(const char *s) */ SYM_FUNC_START(strlen) diff --git a/arch/riscv/lib/strncmp.S b/arch/riscv/lib/strncmp.S index 7ac2f667285a..aba5b3148621 100644 --- a/arch/riscv/lib/strncmp.S +++ b/arch/riscv/lib/strncmp.S @@ -2,9 +2,8 @@ #include #include -#include #include -#include +#include /* int strncmp(const char *cs, const char *ct, size_t count) */ SYM_FUNC_START(strncmp) -- cgit v1.2.3 From 816a69744102bc886edc3557d00e840e3e35e7d5 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Fri, 24 Feb 2023 16:46:01 +0100 Subject: riscv: cpufeature: Drop errata_list.h and other unused includes Drop errata_list.h, since cpufeature.c includes hwcap.h directly to get cpufeature IDs. And, while there, prune the rest of the unused includes too. Signed-off-by: Andrew Jones Reviewed-by: Conor Dooley Reviewed-by: Heiko Stuebner Link: https://lore.kernel.org/r/20230224154601.88163-7-ajones@ventanamicro.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/cpufeature.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c index 059db20b28ca..6569d963fc7d 100644 --- a/arch/riscv/kernel/cpufeature.c +++ b/arch/riscv/kernel/cpufeature.c @@ -8,20 +8,15 @@ #include #include -#include #include #include #include #include #include #include -#include #include #include -#include #include -#include -#include #define NUM_ALPHA_EXTS ('z' - 'a' + 1) -- cgit v1.2.3 From 0b2f658f5370d9818244682c76fd8f6a91b2b1af Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Fri, 24 Feb 2023 17:26:24 +0100 Subject: RISC-V: alternatives: Support patching multiple insns in assembly As pointed out in commit d374a16539b1 ("RISC-V: fix compile error from deduplicated __ALTERNATIVE_CFG_2"), we need quotes around parameters passed to macros within macros to avoid spaces being interpreted as separators. ALT_NEW_CONTENT was trying to handle this by defining new_c as a vararg, but this isn't sufficient for calling ALTERNATIVE() from assembly with multiple instructions in the new/old sequences. Remove the vararg "hack" and use quotes.
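To illustrate, with this fix an assembly caller can pass multi-instruction sequences directly. This is a sketch modeled on the clear_page() patch later in this series, which relies on exactly this:

	/* in a .S file: both sequences hold two instructions; before this
	 * fix the embedded spaces broke gas macro argument parsing */
	ALTERNATIVE("bltu a0, a2, 1f; ret", "nop; nop",
		    0, RISCV_ISA_EXT_ZICBOZ, CONFIG_RISCV_ISA_ZICBOZ)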
Signed-off-by: Andrew Jones Reviewed-by: Conor Dooley Link: https://lore.kernel.org/r/20230224162631.405473-2-ajones@ventanamicro.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/alternative-macros.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/riscv/include/asm/alternative-macros.h b/arch/riscv/include/asm/alternative-macros.h index 993a44a8fdac..b8c55fb3ab2c 100644 --- a/arch/riscv/include/asm/alternative-macros.h +++ b/arch/riscv/include/asm/alternative-macros.h @@ -14,7 +14,7 @@ .4byte \patch_id .endm -.macro ALT_NEW_CONTENT vendor_id, patch_id, enable = 1, new_c : vararg +.macro ALT_NEW_CONTENT vendor_id, patch_id, enable = 1, new_c .if \enable .pushsection .alternative, "a" ALT_ENTRY 886b, 888f, \vendor_id, \patch_id, 889f - 888f @@ -41,13 +41,13 @@ \old_c .option pop 887 : - ALT_NEW_CONTENT \vendor_id, \patch_id, \enable, \new_c + ALT_NEW_CONTENT \vendor_id, \patch_id, \enable, "\new_c" .endm .macro ALTERNATIVE_CFG_2 old_c, new_c_1, vendor_id_1, patch_id_1, enable_1, \ new_c_2, vendor_id_2, patch_id_2, enable_2 ALTERNATIVE_CFG "\old_c", "\new_c_1", \vendor_id_1, \patch_id_1, \enable_1 - ALT_NEW_CONTENT \vendor_id_2, \patch_id_2, \enable_2, \new_c_2 + ALT_NEW_CONTENT \vendor_id_2, \patch_id_2, \enable_2, "\new_c_2" .endm #define __ALTERNATIVE_CFG(...) ALTERNATIVE_CFG __VA_ARGS__ -- cgit v1.2.3 From 8b05e7d0408ac99e2f05f3673f2f249e9a9e10ec Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Fri, 24 Feb 2023 17:26:25 +0100 Subject: RISC-V: Factor out body of riscv_init_cbom_blocksize loop Refactor riscv_init_cbom_blocksize() to prepare for it to be used for both cbom block size and cboz block size. Signed-off-by: Andrew Jones Reviewed-by: Heiko Stuebner Reviewed-by: Conor Dooley Link: https://lore.kernel.org/r/20230224162631.405473-3-ajones@ventanamicro.com Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/cacheflush.c | 45 +++++++++++++++++++++++++-------------------- 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/arch/riscv/mm/cacheflush.c b/arch/riscv/mm/cacheflush.c index fcd6145fbead..9f37c7a330bf 100644 --- a/arch/riscv/mm/cacheflush.c +++ b/arch/riscv/mm/cacheflush.c @@ -100,34 +100,39 @@ void flush_icache_pte(pte_t pte) unsigned int riscv_cbom_block_size; EXPORT_SYMBOL_GPL(riscv_cbom_block_size); +static void cbo_get_block_size(struct device_node *node, + const char *name, u32 *block_size, + unsigned long *first_hartid) +{ + unsigned long hartid; + u32 val; + + if (riscv_of_processor_hartid(node, &hartid)) + return; + + if (of_property_read_u32(node, name, &val)) + return; + + if (!*block_size) { + *block_size = val; + *first_hartid = hartid; + } else if (*block_size != val) { + pr_warn("%s mismatched between harts %lu and %lu\n", + name, *first_hartid, hartid); + } +} + void riscv_init_cbom_blocksize(void) { struct device_node *node; unsigned long cbom_hartid; - u32 val, probed_block_size; - int ret; + u32 probed_block_size; probed_block_size = 0; for_each_of_cpu_node(node) { - unsigned long hartid; - - ret = riscv_of_processor_hartid(node, &hartid); - if (ret) - continue; - /* set block-size for cbom extension if available */ - ret = of_property_read_u32(node, "riscv,cbom-block-size", &val); - if (ret) - continue; - - if (!probed_block_size) { - probed_block_size = val; - cbom_hartid = hartid; - } else { - if (probed_block_size != val) - pr_warn("cbom-block-size mismatched between harts %lu and %lu\n", - cbom_hartid, hartid); - } + cbo_get_block_size(node, "riscv,cbom-block-size", + &probed_block_size, &cbom_hartid); } 
if (probed_block_size) riscv_cbom_block_size = probed_block_size; } -- cgit v1.2.3 From ea20f117ab99d7c7653df656ddb795e51d9f7733 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Fri, 24 Feb 2023 17:26:26 +0100 Subject: dt-bindings: riscv: Document cboz-block-size The Zicboz operation (cbo.zero) operates on a block-size defined for the cpu-core. While we already have the riscv,cbom-block-size property, it only provides the block size for Zicbom operations. Even though it's likely Zicboz and Zicbom will use the same size, that's not required by the specification. Create another property specifically for Zicboz. Cc: Rob Herring Signed-off-by: Andrew Jones Reviewed-by: Conor Dooley Reviewed-by: Rob Herring Link: https://lore.kernel.org/r/20230224162631.405473-4-ajones@ventanamicro.com Signed-off-by: Palmer Dabbelt --- Documentation/devicetree/bindings/riscv/cpus.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Documentation/devicetree/bindings/riscv/cpus.yaml b/Documentation/devicetree/bindings/riscv/cpus.yaml index 001931d526ec..f24cf9601c6e 100644 --- a/Documentation/devicetree/bindings/riscv/cpus.yaml +++ b/Documentation/devicetree/bindings/riscv/cpus.yaml @@ -72,6 +72,11 @@ properties: description: The blocksize in bytes for the Zicbom cache operations. + riscv,cboz-block-size: + $ref: /schemas/types.yaml#/definitions/uint32 + description: + The blocksize in bytes for the Zicboz cache operations. + riscv,isa: description: Identifies the specific RISC-V instruction set architecture -- cgit v1.2.3 From 7ea5a73617e931230a46150dc7c1bbfd98f24c8e Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Fri, 24 Feb 2023 17:26:27 +0100 Subject: RISC-V: Add Zicboz detection and block size parsing Parse "riscv,cboz-block-size" from the DT by piggybacking on Zicbom's riscv_init_cbom_blocksize(). Additionally check the DT for the presence of the "zicboz" extension and, when it's present, validate the parsed cboz block size as we do Zicbom's cbom block size with riscv_isa_extension_check().
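For illustration, a cpu node advertising both extensions and their block sizes might look like the following hypothetical device tree fragment (names taken from the binding above; the values are examples only):

	cpu@0 {
		device_type = "cpu";
		compatible = "riscv";
		riscv,isa = "rv64imafdc_zicbom_zicboz";
		riscv,cbom-block-size = <64>;
		riscv,cboz-block-size = <64>;
	};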
Signed-off-by: Andrew Jones Reviewed-by: Heiko Stuebner Reviewed-by: Conor Dooley Link: https://lore.kernel.org/r/20230224162631.405473-5-ajones@ventanamicro.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/cacheflush.h | 3 ++- arch/riscv/include/asm/hwcap.h | 1 + arch/riscv/kernel/cpu.c | 1 + arch/riscv/kernel/cpufeature.c | 10 ++++++++++ arch/riscv/kernel/setup.c | 2 +- arch/riscv/mm/cacheflush.c | 23 +++++++++++++++-------- 6 files changed, 30 insertions(+), 10 deletions(-) diff --git a/arch/riscv/include/asm/cacheflush.h b/arch/riscv/include/asm/cacheflush.h index 03e3b95ae6da..8091b8bf4883 100644 --- a/arch/riscv/include/asm/cacheflush.h +++ b/arch/riscv/include/asm/cacheflush.h @@ -50,7 +50,8 @@ void flush_icache_mm(struct mm_struct *mm, bool local); #endif /* CONFIG_SMP */ extern unsigned int riscv_cbom_block_size; -void riscv_init_cbom_blocksize(void); +extern unsigned int riscv_cboz_block_size; +void riscv_init_cbo_blocksizes(void); #ifdef CONFIG_RISCV_DMA_NONCOHERENT void riscv_noncoherent_supported(void); diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h index e3021b2590de..5955fbeaba86 100644 --- a/arch/riscv/include/asm/hwcap.h +++ b/arch/riscv/include/asm/hwcap.h @@ -42,6 +42,7 @@ #define RISCV_ISA_EXT_ZBB 30 #define RISCV_ISA_EXT_ZICBOM 31 #define RISCV_ISA_EXT_ZIHINTPAUSE 32 +#define RISCV_ISA_EXT_ZICBOZ 33 #define RISCV_ISA_EXT_MAX 64 #define RISCV_ISA_EXT_NAME_LEN_MAX 32 diff --git a/arch/riscv/kernel/cpu.c b/arch/riscv/kernel/cpu.c index 8400f0cc9704..1b0411280141 100644 --- a/arch/riscv/kernel/cpu.c +++ b/arch/riscv/kernel/cpu.c @@ -186,6 +186,7 @@ arch_initcall(riscv_cpuinfo_init); */ static struct riscv_isa_ext_data isa_ext_arr[] = { __RISCV_ISA_EXT_DATA(zicbom, RISCV_ISA_EXT_ZICBOM), + __RISCV_ISA_EXT_DATA(zicboz, RISCV_ISA_EXT_ZICBOZ), __RISCV_ISA_EXT_DATA(zihintpause, RISCV_ISA_EXT_ZIHINTPAUSE), __RISCV_ISA_EXT_DATA(zbb, RISCV_ISA_EXT_ZBB), __RISCV_ISA_EXT_DATA(sscofpmf, RISCV_ISA_EXT_SSCOFPMF), diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c index 6569d963fc7d..538779d03311 100644 --- a/arch/riscv/kernel/cpufeature.c +++ b/arch/riscv/kernel/cpufeature.c @@ -74,6 +74,15 @@ static bool riscv_isa_extension_check(int id) return false; } return true; + case RISCV_ISA_EXT_ZICBOZ: + if (!riscv_cboz_block_size) { + pr_err("Zicboz detected in ISA string, but no cboz-block-size found\n"); + return false; + } else if (!is_power_of_2(riscv_cboz_block_size)) { + pr_err("cboz-block-size present, but is not a power-of-2\n"); + return false; + } + return true; } return true; @@ -222,6 +231,7 @@ void __init riscv_fill_hwcap(void) SET_ISA_EXT_MAP("svpbmt", RISCV_ISA_EXT_SVPBMT); SET_ISA_EXT_MAP("zbb", RISCV_ISA_EXT_ZBB); SET_ISA_EXT_MAP("zicbom", RISCV_ISA_EXT_ZICBOM); + SET_ISA_EXT_MAP("zicboz", RISCV_ISA_EXT_ZICBOZ); SET_ISA_EXT_MAP("zihintpause", RISCV_ISA_EXT_ZIHINTPAUSE); } #undef SET_ISA_EXT_MAP diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c index 376d2827e736..5d3184cbf518 100644 --- a/arch/riscv/kernel/setup.c +++ b/arch/riscv/kernel/setup.c @@ -297,7 +297,7 @@ void __init setup_arch(char **cmdline_p) setup_smp(); #endif - riscv_init_cbom_blocksize(); + riscv_init_cbo_blocksizes(); riscv_fill_hwcap(); apply_boot_alternatives(); if (IS_ENABLED(CONFIG_RISCV_ISA_ZICBOM) && diff --git a/arch/riscv/mm/cacheflush.c b/arch/riscv/mm/cacheflush.c index 9f37c7a330bf..632d6d06148a 100644 --- a/arch/riscv/mm/cacheflush.c +++ b/arch/riscv/mm/cacheflush.c @@ -100,6 +100,9 @@ void 
flush_icache_pte(pte_t pte) unsigned int riscv_cbom_block_size; EXPORT_SYMBOL_GPL(riscv_cbom_block_size); +unsigned int riscv_cboz_block_size; +EXPORT_SYMBOL_GPL(riscv_cboz_block_size); + static void cbo_get_block_size(struct device_node *node, const char *name, u32 *block_size, unsigned long *first_hartid) @@ -122,19 +125,23 @@ static void cbo_get_block_size(struct device_node *node, } } -void riscv_init_cbom_blocksize(void) +void riscv_init_cbo_blocksizes(void) { + unsigned long cbom_hartid, cboz_hartid; + u32 cbom_block_size = 0, cboz_block_size = 0; struct device_node *node; - unsigned long cbom_hartid; - u32 probed_block_size; - probed_block_size = 0; for_each_of_cpu_node(node) { - /* set block-size for cbom extension if available */ + /* set block-size for cbom and/or cboz extension if available */ cbo_get_block_size(node, "riscv,cbom-block-size", - &probed_block_size, &cbom_hartid); + &cbom_block_size, &cbom_hartid); + cbo_get_block_size(node, "riscv,cboz-block-size", + &cboz_block_size, &cboz_hartid); } - if (probed_block_size) - riscv_cbom_block_size = probed_block_size; + if (cbom_block_size) + riscv_cbom_block_size = cbom_block_size; + + if (cboz_block_size) + riscv_cboz_block_size = cboz_block_size; } -- cgit v1.2.3 From d25f256332cc795b0fe16ba541fe0c05f0eb2c59 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Fri, 24 Feb 2023 17:26:28 +0100 Subject: RISC-V: cpufeatures: Put the upper 16 bits of patch ID to work cpufeature IDs are consecutive integers starting at 26, so a 32-bit patch ID allows an aircraft carrier load of feature IDs. Repurposing the upper 16 bits still leaves a boat load of feature IDs and gains 16 bits which may be used to control patching on a per patch-site basis. This will be initially used in Zicboz's application to clear_page(), as Zicboz's block size must also be considered. In that case, the upper 16-bit value's role will be to convey the maximum block size which the Zicboz clear_page() implementation supports. cpufeature patch sites which need to check for the existence or absence of other cpufeatures may also be able to make use of this. Signed-off-by: Andrew Jones Reviewed-by: Conor Dooley Link: https://lore.kernel.org/r/20230224162631.405473-6-ajones@ventanamicro.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/alternative.h | 4 ++++ arch/riscv/kernel/cpufeature.c | 37 ++++++++++++++++++++++++++++++++---- 2 files changed, 37 insertions(+), 4 deletions(-) diff --git a/arch/riscv/include/asm/alternative.h b/arch/riscv/include/asm/alternative.h index c8dea9e94310..58ccd2f8cab7 100644 --- a/arch/riscv/include/asm/alternative.h +++ b/arch/riscv/include/asm/alternative.h @@ -13,10 +13,14 @@ #ifdef CONFIG_RISCV_ALTERNATIVE #include +#include #include #include #include +#define PATCH_ID_CPUFEATURE_ID(p) lower_16_bits(p) +#define PATCH_ID_CPUFEATURE_VALUE(p) upper_16_bits(p) + #define RISCV_ALTERNATIVES_BOOT 0 /* alternatives applied during regular boot */ #define RISCV_ALTERNATIVES_MODULE 1 /* alternatives applied during module-init */ #define RISCV_ALTERNATIVES_EARLY_BOOT 2 /* alternatives applied before mmu start */ diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c index 538779d03311..d424cd76beb1 100644 --- a/arch/riscv/kernel/cpufeature.c +++ b/arch/riscv/kernel/cpufeature.c @@ -274,12 +274,35 @@ void __init riscv_fill_hwcap(void) } #ifdef CONFIG_RISCV_ALTERNATIVE +/* + * Alternative patch sites consider 48 bits when determining when to patch + * the old instruction sequence with the new. 
These bits are broken into a + * 16-bit vendor ID and a 32-bit patch ID. A non-zero vendor ID means the + * patch site is for an erratum, identified by the 32-bit patch ID. When + * the vendor ID is zero, the patch site is for a cpufeature. cpufeatures + * further break down patch ID into two 16-bit numbers. The lower 16 bits + * are the cpufeature ID and the upper 16 bits are used for a value specific + * to the cpufeature and patch site. If the upper 16 bits are zero, then it + * implies no specific value is specified. cpufeatures that want to control + * patching on a per-site basis will provide non-zero values and implement + * checks here. The checks return true when patching should be done, and + * false otherwise. + */ +static bool riscv_cpufeature_patch_check(u16 id, u16 value) +{ + if (!value) + return true; + + return false; +} + void __init_or_module riscv_cpufeature_patch_func(struct alt_entry *begin, struct alt_entry *end, unsigned int stage) { struct alt_entry *alt; void *oldptr, *altptr; + u16 id, value; if (stage == RISCV_ALTERNATIVES_EARLY_BOOT) return; @@ -287,13 +310,19 @@ void __init_or_module riscv_cpufeature_patch_func(struct alt_entry *begin, for (alt = begin; alt < end; alt++) { if (alt->vendor_id != 0) continue; - if (alt->patch_id >= RISCV_ISA_EXT_MAX) { - WARN(1, "This extension id:%d is not in ISA extension list", - alt->patch_id); + + id = PATCH_ID_CPUFEATURE_ID(alt->patch_id); + + if (id >= RISCV_ISA_EXT_MAX) { + WARN(1, "This extension id:%d is not in ISA extension list", id); continue; } - if (!__riscv_isa_extension_available(NULL, alt->patch_id)) + if (!__riscv_isa_extension_available(NULL, id)) + continue; + + value = PATCH_ID_CPUFEATURE_VALUE(alt->patch_id); + if (!riscv_cpufeature_patch_check(id, value)) continue; oldptr = ALT_OLD_PTR(alt); -- cgit v1.2.3 From ab0f77465e3e4ec2d2583cd770b157b16cc22844 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Fri, 24 Feb 2023 17:26:29 +0100 Subject: RISC-V: Use Zicboz in clear_page when available Using memset() to zero a 4K page takes 563 total instructions, where 20 are branches. clear_page(), with Zicboz and a 64 byte block size, takes 169 total instructions, where 4 are branches and 33 are nops. Even though the block size is a variable, thanks to alternatives, we can still implement a Duff device without having to do any preliminary calculations. This is achieved by using the alternatives' cpufeature value (the upper 16 bits of patch_id). The value used is the maximum zicboz block size order accepted at the patch site. This enables us to stop patching / unrolling when 4K bytes have been zeroed (we would loop and continue after 4K if the page size were larger). For 4K pages, unrolling 16 times allows block sizes of 64 and 128 to only loop a few times and larger block sizes to not loop at all. Since cbo.zero doesn't take an offset, we also need an 'add' after each instruction, making the loop body 112 to 160 bytes. Hopefully this is small enough to not cause icache misses.
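Concretely, the unroll arithmetic works out as follows (a worked sketch derived from the clear_page.S diff below, for PAGE_SIZE == 4K):

	/*
	 * Worked numbers for the 16x unroll:
	 *
	 *   block size   bytes zeroed per pass   loop iterations
	 *       64          16 * 64  = 1024            4
	 *      128          16 * 128 = 2048            2
	 *      256          16 * 256 = 4096            1
	 *
	 * For block sizes of 512 and larger, one of the patched-in early
	 * exits (after 8, 4, 2 or 1 cbo.zero instructions) returns as soon
	 * as 4K bytes are zeroed, so the loop body never repeats.
	 */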
Signed-off-by: Andrew Jones Acked-by: Conor Dooley Link: https://lore.kernel.org/r/20230224162631.405473-7-ajones@ventanamicro.com Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 13 +++++++ arch/riscv/include/asm/insn-def.h | 4 +++ arch/riscv/include/asm/page.h | 6 +++- arch/riscv/kernel/cpufeature.c | 11 ++++++ arch/riscv/lib/Makefile | 1 + arch/riscv/lib/clear_page.S | 74 +++++++++++++++++++++++++++++++++++++++ 6 files changed, 108 insertions(+), 1 deletion(-) create mode 100644 arch/riscv/lib/clear_page.S diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 69797e6eed6f..275be2ee87bf 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -457,6 +457,19 @@ config RISCV_ISA_ZICBOM If you don't know what to do here, say Y. +config RISCV_ISA_ZICBOZ + bool "Zicboz extension support for faster zeroing of memory" + depends on !XIP_KERNEL && MMU + select RISCV_ALTERNATIVE + default y + help + Enable the use of the ZICBOZ extension (cbo.zero instruction) + when available. + + The Zicboz extension is used for faster zeroing of memory. + + If you don't know what to do here, say Y. + config TOOLCHAIN_HAS_ZIHINTPAUSE bool default y diff --git a/arch/riscv/include/asm/insn-def.h b/arch/riscv/include/asm/insn-def.h index e01ab51f50d2..6960beb75f32 100644 --- a/arch/riscv/include/asm/insn-def.h +++ b/arch/riscv/include/asm/insn-def.h @@ -192,4 +192,8 @@ INSN_I(OPCODE_MISC_MEM, FUNC3(2), __RD(0), \ RS1(base), SIMM12(2)) +#define CBO_zero(base) \ + INSN_I(OPCODE_MISC_MEM, FUNC3(2), __RD(0), \ + RS1(base), SIMM12(4)) + #endif /* __ASM_INSN_DEF_H */ diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h index 7fed7c431928..21fdf9527ce8 100644 --- a/arch/riscv/include/asm/page.h +++ b/arch/riscv/include/asm/page.h @@ -49,10 +49,14 @@ #ifndef __ASSEMBLY__ +#ifdef CONFIG_RISCV_ISA_ZICBOZ +void clear_page(void *page); +#else #define clear_page(pgaddr) memset((pgaddr), 0, PAGE_SIZE) +#endif #define copy_page(to, from) memcpy((to), (from), PAGE_SIZE) -#define clear_user_page(pgaddr, vaddr, page) memset((pgaddr), 0, PAGE_SIZE) +#define clear_user_page(pgaddr, vaddr, page) clear_page(pgaddr) #define copy_user_page(vto, vfrom, vaddr, topg) \ memcpy((vto), (vfrom), PAGE_SIZE) diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c index d424cd76beb1..8e7b0d703841 100644 --- a/arch/riscv/kernel/cpufeature.c +++ b/arch/riscv/kernel/cpufeature.c @@ -293,6 +293,17 @@ static bool riscv_cpufeature_patch_check(u16 id, u16 value) if (!value) return true; + switch (id) { + case RISCV_ISA_EXT_ZICBOZ: + /* + * Zicboz alternative applications provide the maximum + * supported block size order, or zero when it doesn't + * matter. If the current block size exceeds the maximum, + * then the alternative cannot be applied. + */ + return riscv_cboz_block_size <= (1U << value); + } + return false; } diff --git a/arch/riscv/lib/Makefile b/arch/riscv/lib/Makefile index 6c74b0bedd60..26cb2502ecf8 100644 --- a/arch/riscv/lib/Makefile +++ b/arch/riscv/lib/Makefile @@ -8,5 +8,6 @@ lib-y += strlen.o lib-y += strncmp.o lib-$(CONFIG_MMU) += uaccess.o lib-$(CONFIG_64BIT) += tishift.o +lib-$(CONFIG_RISCV_ISA_ZICBOZ) += clear_page.o obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o diff --git a/arch/riscv/lib/clear_page.S b/arch/riscv/lib/clear_page.S new file mode 100644 index 000000000000..d7a256eb53f4 --- /dev/null +++ b/arch/riscv/lib/clear_page.S @@ -0,0 +1,74 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2023 Ventana Micro Systems Inc. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#define CBOZ_ALT(order, old, new) \ + ALTERNATIVE(old, new, 0, \ + ((order) << 16) | RISCV_ISA_EXT_ZICBOZ, \ + CONFIG_RISCV_ISA_ZICBOZ) + +/* void clear_page(void *page) */ +SYM_FUNC_START(clear_page) + li a2, PAGE_SIZE + + /* + * If Zicboz isn't present, or somehow has a block + * size larger than 4K, then fallback to memset. + */ + CBOZ_ALT(12, "j .Lno_zicboz", "nop") + + lw a1, riscv_cboz_block_size + add a2, a0, a2 +.Lzero_loop: + CBO_zero(a0) + add a0, a0, a1 + CBOZ_ALT(11, "bltu a0, a2, .Lzero_loop; ret", "nop; nop") + CBO_zero(a0) + add a0, a0, a1 + CBOZ_ALT(10, "bltu a0, a2, .Lzero_loop; ret", "nop; nop") + CBO_zero(a0) + add a0, a0, a1 + CBO_zero(a0) + add a0, a0, a1 + CBOZ_ALT(9, "bltu a0, a2, .Lzero_loop; ret", "nop; nop") + CBO_zero(a0) + add a0, a0, a1 + CBO_zero(a0) + add a0, a0, a1 + CBO_zero(a0) + add a0, a0, a1 + CBO_zero(a0) + add a0, a0, a1 + CBOZ_ALT(8, "bltu a0, a2, .Lzero_loop; ret", "nop; nop") + CBO_zero(a0) + add a0, a0, a1 + CBO_zero(a0) + add a0, a0, a1 + CBO_zero(a0) + add a0, a0, a1 + CBO_zero(a0) + add a0, a0, a1 + CBO_zero(a0) + add a0, a0, a1 + CBO_zero(a0) + add a0, a0, a1 + CBO_zero(a0) + add a0, a0, a1 + CBO_zero(a0) + add a0, a0, a1 + bltu a0, a2, .Lzero_loop + ret +.Lno_zicboz: + li a1, 0 + tail __memset +SYM_FUNC_END(clear_page) +EXPORT_SYMBOL(clear_page) -- cgit v1.2.3 From 665fd8862413f21cd924fc121eadcd32ea122c52 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Fri, 24 Feb 2023 17:26:30 +0100 Subject: RISC-V: KVM: Provide UAPI for Zicboz block size We're about to allow guests to use the Zicboz extension. KVM userspace needs to know the cache block size in order to properly advertise it to the guest. Provide a virtual config register for userspace to get it with the GET_ONE_REG API, but setting it cannot be supported, so disallow SET_ONE_REG. 
Signed-off-by: Andrew Jones Reviewed-by: Conor Dooley Reviewed-by: Anup Patel Link: https://lore.kernel.org/r/20230224162631.405473-8-ajones@ventanamicro.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/uapi/asm/kvm.h | 1 + arch/riscv/kvm/vcpu.c | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/arch/riscv/include/uapi/asm/kvm.h b/arch/riscv/include/uapi/asm/kvm.h index 92af6f3f057c..c1a1bb0fa91c 100644 --- a/arch/riscv/include/uapi/asm/kvm.h +++ b/arch/riscv/include/uapi/asm/kvm.h @@ -52,6 +52,7 @@ struct kvm_riscv_config { unsigned long mvendorid; unsigned long marchid; unsigned long mimpid; + unsigned long zicboz_block_size; }; /* CORE registers for KVM_GET_ONE_REG and KVM_SET_ONE_REG */ diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c index 7d010b0be54e..525d785ccba2 100644 --- a/arch/riscv/kvm/vcpu.c +++ b/arch/riscv/kvm/vcpu.c @@ -283,6 +283,11 @@ static int kvm_riscv_vcpu_get_reg_config(struct kvm_vcpu *vcpu, return -EINVAL; reg_val = riscv_cbom_block_size; break; + case KVM_REG_RISCV_CONFIG_REG(zicboz_block_size): + if (!riscv_isa_extension_available(vcpu->arch.isa, ZICBOZ)) + return -EINVAL; + reg_val = riscv_cboz_block_size; + break; case KVM_REG_RISCV_CONFIG_REG(mvendorid): reg_val = vcpu->arch.mvendorid; break; @@ -354,6 +359,8 @@ static int kvm_riscv_vcpu_set_reg_config(struct kvm_vcpu *vcpu, break; case KVM_REG_RISCV_CONFIG_REG(zicbom_block_size): return -EOPNOTSUPP; + case KVM_REG_RISCV_CONFIG_REG(zicboz_block_size): + return -EOPNOTSUPP; case KVM_REG_RISCV_CONFIG_REG(mvendorid): if (!vcpu->arch.ran_atleast_once) vcpu->arch.mvendorid = reg_val; -- cgit v1.2.3 From b20f67994f35d75758ff671cd5095ec0cfab6ff9 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Fri, 24 Feb 2023 17:26:31 +0100 Subject: RISC-V: KVM: Expose Zicboz to the guest Guests may use the cbo.zero instruction when the CPU has the Zicboz extension and the hypervisor sets henvcfg.CBZE. Add Zicboz support for KVM guests which may be enabled and disabled from KVM userspace using the ISA extension ONE_REG API. 
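With the same includes and 64-bit-host assumption as the previous sketch, userspace can toggle the extension per vcpu through the ISA extension ONE_REG API, roughly:

  /* Enable (1) or disable (0) Zicboz for a vcpu; sketch only. */
  int set_guest_zicboz(int vcpu_fd, uint64_t enable)
  {
          struct kvm_one_reg reg = {
                  .id = KVM_REG_RISCV | KVM_REG_SIZE_U64 |
                        KVM_REG_RISCV_ISA_EXT |
                        KVM_RISCV_ISA_EXT_ZICBOZ,
                  .addr = (uint64_t)(unsigned long)&enable,
          };

          return ioctl(vcpu_fd, KVM_SET_ONE_REG, &reg);
  }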
Signed-off-by: Andrew Jones Reviewed-by: Conor Dooley Reviewed-by: Anup Patel Link: https://lore.kernel.org/r/20230224162631.405473-9-ajones@ventanamicro.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/uapi/asm/kvm.h | 1 + arch/riscv/kvm/vcpu.c | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/arch/riscv/include/uapi/asm/kvm.h b/arch/riscv/include/uapi/asm/kvm.h index c1a1bb0fa91c..e44c1e90eaa7 100644 --- a/arch/riscv/include/uapi/asm/kvm.h +++ b/arch/riscv/include/uapi/asm/kvm.h @@ -106,6 +106,7 @@ enum KVM_RISCV_ISA_EXT_ID { KVM_RISCV_ISA_EXT_SVINVAL, KVM_RISCV_ISA_EXT_ZIHINTPAUSE, KVM_RISCV_ISA_EXT_ZICBOM, + KVM_RISCV_ISA_EXT_ZICBOZ, KVM_RISCV_ISA_EXT_MAX, }; diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c index 525d785ccba2..6adb1b6112a1 100644 --- a/arch/riscv/kvm/vcpu.c +++ b/arch/riscv/kvm/vcpu.c @@ -63,6 +63,7 @@ static const unsigned long kvm_isa_ext_arr[] = { KVM_ISA_EXT_ARR(SVPBMT), KVM_ISA_EXT_ARR(ZIHINTPAUSE), KVM_ISA_EXT_ARR(ZICBOM), + KVM_ISA_EXT_ARR(ZICBOZ), }; static unsigned long kvm_riscv_vcpu_base2isa_ext(unsigned long base_ext) @@ -872,6 +873,9 @@ static void kvm_riscv_vcpu_update_config(const unsigned long *isa) if (riscv_isa_extension_available(isa, ZICBOM)) henvcfg |= (ENVCFG_CBIE | ENVCFG_CBCFE); + if (riscv_isa_extension_available(isa, ZICBOZ)) + henvcfg |= ENVCFG_CBZE; + csr_write(CSR_HENVCFG, henvcfg); #ifdef CONFIG_32BIT csr_write(CSR_HENVCFGH, henvcfg >> 32); -- cgit v1.2.3 From 8574bf8d0ddd46f0bc393381563f59a6f4418a3e Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Tue, 21 Feb 2023 22:30:16 -0500 Subject: riscv: ptrace: Remove duplicate operation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The TIF_SYSCALL_TRACE is controlled by a common code, see kernel/ptrace.c and include/linux/thread_info.h. clear_task_syscall_work(child, SYSCALL_TRACE); Signed-off-by: Guo Ren Signed-off-by: Guo Ren Reviewed-by: Oleg Nesterov Reviewed-by: Björn Töpel Link: https://lore.kernel.org/r/20230222033021.983168-3-guoren@kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/ptrace.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/riscv/kernel/ptrace.c b/arch/riscv/kernel/ptrace.c index 2ae8280ae475..44f4b1ca315d 100644 --- a/arch/riscv/kernel/ptrace.c +++ b/arch/riscv/kernel/ptrace.c @@ -212,7 +212,6 @@ unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, unsigned int n) void ptrace_disable(struct task_struct *child) { - clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); } long arch_ptrace(struct task_struct *child, long request, -- cgit v1.2.3 From d0db02c628793e389538c0e5c3f6491141ea6200 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Tue, 21 Feb 2023 22:30:17 -0500 Subject: riscv: entry: Add noinstr to prevent instrumentation inserted MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Without noinstr the compiler is free to insert instrumentation (think all the k*SAN, KCov, GCov, ftrace etc..) which can call code we're not yet ready to run this early in the entry path, for instance it could rely on RCU which isn't on yet, or expect lockdep state. 
(by peterz) Link: https://lore.kernel.org/linux-riscv/YxcQ6NoPf3AH0EXe@hirez.programming.kicks-ass.net/ Reviewed-by: Björn Töpel Suggested-by: Peter Zijlstra Tested-by: Jisheng Zhang Signed-off-by: Guo Ren Signed-off-by: Guo Ren Link: https://lore.kernel.org/r/20230222033021.983168-4-guoren@kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/traps.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/riscv/kernel/traps.c b/arch/riscv/kernel/traps.c index f6fda94e8e59..02727a4fe8be 100644 --- a/arch/riscv/kernel/traps.c +++ b/arch/riscv/kernel/traps.c @@ -119,9 +119,9 @@ static void do_trap_error(struct pt_regs *regs, int signo, int code, } #if defined(CONFIG_XIP_KERNEL) && defined(CONFIG_RISCV_ALTERNATIVE) -#define __trap_section __section(".xip.traps") +#define __trap_section __noinstr_section(".xip.traps") #else -#define __trap_section +#define __trap_section noinstr #endif #define DO_ERROR_INFO(name, signo, code, str) \ asmlinkage __visible __trap_section void name(struct pt_regs *regs) \ -- cgit v1.2.3 From f0bddf50586da81360627a772be0e355b62f071e Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Tue, 21 Feb 2023 22:30:18 -0500 Subject: riscv: entry: Convert to generic entry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch converts riscv to use the generic entry infrastructure from kernel/entry/*. The generic entry makes maintainers' work easier and codes more elegant. Here are the changes: - More clear entry.S with handle_exception and ret_from_exception - Get rid of complex custom signal implementation - Move syscall procedure from assembly to C, which is much more readable. - Connect ret_from_fork & ret_from_kernel_thread to generic entry. - Wrap with irqentry_enter/exit and syscall_enter/exit_from_user_mode - Use the standard preemption code instead of custom Suggested-by: Huacai Chen Reviewed-by: Björn Töpel Tested-by: Yipeng Zou Tested-by: Jisheng Zhang Signed-off-by: Guo Ren Signed-off-by: Guo Ren Cc: Ben Hutchings Link: https://lore.kernel.org/r/20230222033021.983168-5-guoren@kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 1 + arch/riscv/include/asm/asm-prototypes.h | 2 + arch/riscv/include/asm/csr.h | 1 - arch/riscv/include/asm/entry-common.h | 11 ++ arch/riscv/include/asm/ptrace.h | 10 +- arch/riscv/include/asm/stacktrace.h | 5 + arch/riscv/include/asm/syscall.h | 21 +++ arch/riscv/include/asm/thread_info.h | 13 +- arch/riscv/kernel/entry.S | 242 +++++--------------------------- arch/riscv/kernel/head.h | 1 - arch/riscv/kernel/ptrace.c | 43 ------ arch/riscv/kernel/signal.c | 29 +--- arch/riscv/kernel/traps.c | 140 +++++++++++++++--- arch/riscv/mm/fault.c | 6 +- 14 files changed, 210 insertions(+), 315 deletions(-) create mode 100644 arch/riscv/include/asm/entry-common.h diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index c5e42cc37604..41a9a0d5afa5 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -60,6 +60,7 @@ config RISCV select GENERIC_ATOMIC64 if !64BIT select GENERIC_CLOCKEVENTS_BROADCAST if SMP select GENERIC_EARLY_IOREMAP + select GENERIC_ENTRY select GENERIC_GETTIMEOFDAY if HAVE_GENERIC_VDSO select GENERIC_IDLE_POLL_SETUP select GENERIC_IOREMAP if MMU diff --git a/arch/riscv/include/asm/asm-prototypes.h b/arch/riscv/include/asm/asm-prototypes.h index ef386fcf3939..61ba8ed43d8f 100644 --- a/arch/riscv/include/asm/asm-prototypes.h +++ b/arch/riscv/include/asm/asm-prototypes.h @@ -27,5 +27,7 @@ DECLARE_DO_ERROR_INFO(do_trap_break); asmlinkage unsigned long 
get_overflow_stack(void); asmlinkage void handle_bad_stack(struct pt_regs *regs); +asmlinkage void do_page_fault(struct pt_regs *regs); +asmlinkage void do_irq(struct pt_regs *regs); #endif /* _ASM_RISCV_PROTOTYPES_H */ diff --git a/arch/riscv/include/asm/csr.h b/arch/riscv/include/asm/csr.h index 0e571f6483d9..7c2b8cdb7b77 100644 --- a/arch/riscv/include/asm/csr.h +++ b/arch/riscv/include/asm/csr.h @@ -40,7 +40,6 @@ #define SR_UXL _AC(0x300000000, UL) /* XLEN mask for U-mode */ #define SR_UXL_32 _AC(0x100000000, UL) /* XLEN = 32 for U-mode */ #define SR_UXL_64 _AC(0x200000000, UL) /* XLEN = 64 for U-mode */ -#define SR_UXL_SHIFT 32 #endif /* SATP flags */ diff --git a/arch/riscv/include/asm/entry-common.h b/arch/riscv/include/asm/entry-common.h new file mode 100644 index 000000000000..6e4dee49d84b --- /dev/null +++ b/arch/riscv/include/asm/entry-common.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _ASM_RISCV_ENTRY_COMMON_H +#define _ASM_RISCV_ENTRY_COMMON_H + +#include + +void handle_page_fault(struct pt_regs *regs); +void handle_break(struct pt_regs *regs); + +#endif /* _ASM_RISCV_ENTRY_COMMON_H */ diff --git a/arch/riscv/include/asm/ptrace.h b/arch/riscv/include/asm/ptrace.h index 6ecd461129d2..b5b0adcc85c1 100644 --- a/arch/riscv/include/asm/ptrace.h +++ b/arch/riscv/include/asm/ptrace.h @@ -53,6 +53,9 @@ struct pt_regs { unsigned long orig_a0; }; +#define PTRACE_SYSEMU 0x1f +#define PTRACE_SYSEMU_SINGLESTEP 0x20 + #ifdef CONFIG_64BIT #define REG_FMT "%016lx" #else @@ -121,8 +124,6 @@ extern unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr, unsigned long frame_pointer); -int do_syscall_trace_enter(struct pt_regs *regs); -void do_syscall_trace_exit(struct pt_regs *regs); /** * regs_get_register() - get register value from its offset @@ -172,6 +173,11 @@ static inline unsigned long regs_get_kernel_argument(struct pt_regs *regs, return 0; } +static inline int regs_irqs_disabled(struct pt_regs *regs) +{ + return !(regs->status & SR_PIE); +} + #endif /* __ASSEMBLY__ */ #endif /* _ASM_RISCV_PTRACE_H */ diff --git a/arch/riscv/include/asm/stacktrace.h b/arch/riscv/include/asm/stacktrace.h index 3450c1912afd..f7e8ef2418b9 100644 --- a/arch/riscv/include/asm/stacktrace.h +++ b/arch/riscv/include/asm/stacktrace.h @@ -16,4 +16,9 @@ extern void notrace walk_stackframe(struct task_struct *task, struct pt_regs *re extern void dump_backtrace(struct pt_regs *regs, struct task_struct *task, const char *loglvl); +static inline bool on_thread_stack(void) +{ + return !(((unsigned long)(current->stack) ^ current_stack_pointer) & ~(THREAD_SIZE - 1)); +} + #endif /* _ASM_RISCV_STACKTRACE_H */ diff --git a/arch/riscv/include/asm/syscall.h b/arch/riscv/include/asm/syscall.h index 384a63b86420..736110e1fd78 100644 --- a/arch/riscv/include/asm/syscall.h +++ b/arch/riscv/include/asm/syscall.h @@ -74,5 +74,26 @@ static inline int syscall_get_arch(struct task_struct *task) #endif } +typedef long (*syscall_t)(ulong, ulong, ulong, ulong, ulong, ulong, ulong); +static inline void syscall_handler(struct pt_regs *regs, ulong syscall) +{ + syscall_t fn; + +#ifdef CONFIG_COMPAT + if ((regs->status & SR_UXL) == SR_UXL_32) + fn = compat_sys_call_table[syscall]; + else +#endif + fn = sys_call_table[syscall]; + + regs->a0 = fn(regs->orig_a0, regs->a1, regs->a2, + regs->a3, regs->a4, regs->a5, regs->a6); +} + +static inline bool arch_syscall_is_vdso_sigreturn(struct pt_regs *regs) +{ + return false; +} + asmlinkage 
long sys_riscv_flush_icache(uintptr_t, uintptr_t, uintptr_t); #endif /* _ASM_RISCV_SYSCALL_H */ diff --git a/arch/riscv/include/asm/thread_info.h b/arch/riscv/include/asm/thread_info.h index f704c8dd57e0..e0d202134b44 100644 --- a/arch/riscv/include/asm/thread_info.h +++ b/arch/riscv/include/asm/thread_info.h @@ -67,6 +67,7 @@ struct thread_info { long kernel_sp; /* Kernel stack pointer */ long user_sp; /* User stack pointer */ int cpu; + unsigned long syscall_work; /* SYSCALL_WORK_ flags */ }; /* @@ -89,26 +90,18 @@ struct thread_info { * - pending work-to-be-done flags are in lowest half-word * - other flags in upper half-word(s) */ -#define TIF_SYSCALL_TRACE 0 /* syscall trace active */ #define TIF_NOTIFY_RESUME 1 /* callback before returning to user */ #define TIF_SIGPENDING 2 /* signal pending */ #define TIF_NEED_RESCHED 3 /* rescheduling necessary */ #define TIF_RESTORE_SIGMASK 4 /* restore signal mask in do_signal() */ #define TIF_MEMDIE 5 /* is terminating due to OOM killer */ -#define TIF_SYSCALL_TRACEPOINT 6 /* syscall tracepoint instrumentation */ -#define TIF_SYSCALL_AUDIT 7 /* syscall auditing */ -#define TIF_SECCOMP 8 /* syscall secure computing */ #define TIF_NOTIFY_SIGNAL 9 /* signal notifications exist */ #define TIF_UPROBE 10 /* uprobe breakpoint or singlestep */ #define TIF_32BIT 11 /* compat-mode 32bit process */ -#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) -#define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) -#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) -#define _TIF_SECCOMP (1 << TIF_SECCOMP) #define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) #define _TIF_UPROBE (1 << TIF_UPROBE) @@ -116,8 +109,4 @@ struct thread_info { (_TIF_NOTIFY_RESUME | _TIF_SIGPENDING | _TIF_NEED_RESCHED | \ _TIF_NOTIFY_SIGNAL | _TIF_UPROBE) -#define _TIF_SYSCALL_WORK \ - (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_TRACEPOINT | _TIF_SYSCALL_AUDIT | \ - _TIF_SECCOMP) - #endif /* _ASM_RISCV_THREAD_INFO_H */ diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S index 99d38fdf8b18..bc322f92ba34 100644 --- a/arch/riscv/kernel/entry.S +++ b/arch/riscv/kernel/entry.S @@ -14,11 +14,7 @@ #include #include -#if !IS_ENABLED(CONFIG_PREEMPTION) -.set resume_kernel, restore_all -#endif - -ENTRY(handle_exception) +SYM_CODE_START(handle_exception) /* * If coming from userspace, preserve the user thread pointer and load * the kernel thread pointer. If we came from the kernel, the scratch @@ -106,19 +102,8 @@ _save_context: .option norelax la gp, __global_pointer$ .option pop - -#ifdef CONFIG_TRACE_IRQFLAGS - call __trace_hardirqs_off -#endif - -#ifdef CONFIG_CONTEXT_TRACKING_USER - /* If previous state is in user mode, call user_exit_callable(). */ - li a0, SR_PP - and a0, s1, a0 - bnez a0, skip_context_tracking - call user_exit_callable -skip_context_tracking: -#endif + move a0, sp /* pt_regs */ + la ra, ret_from_exception /* * MSB of cause differentiates between @@ -126,38 +111,13 @@ skip_context_tracking: */ bge s4, zero, 1f - la ra, ret_from_exception - /* Handle interrupts */ - move a0, sp /* pt_regs */ - la a1, generic_handle_arch_irq - jr a1 -1: - /* - * Exceptions run with interrupts enabled or disabled depending on the - * state of SR_PIE in m/sstatus. - */ - andi t0, s1, SR_PIE - beqz t0, 1f - /* kprobes, entered via ebreak, must have interrupts disabled. 
*/ - li t0, EXC_BREAKPOINT - beq s4, t0, 1f -#ifdef CONFIG_TRACE_IRQFLAGS - call __trace_hardirqs_on -#endif - csrs CSR_STATUS, SR_IE - + tail do_irq 1: - la ra, ret_from_exception - /* Handle syscalls */ - li t0, EXC_SYSCALL - beq s4, t0, handle_syscall - /* Handle other exceptions */ slli t0, s4, RISCV_LGPTR la t1, excp_vect_table la t2, excp_vect_table_end - move a0, sp /* pt_regs */ add t0, t1, t0 /* Check if exception code lies within bounds */ bgeu t0, t2, 1f @@ -165,95 +125,17 @@ skip_context_tracking: jr t0 1: tail do_trap_unknown +SYM_CODE_END(handle_exception) -handle_syscall: -#ifdef CONFIG_RISCV_M_MODE - /* - * When running is M-Mode (no MMU config), MPIE does not get set. - * As a result, we need to force enable interrupts here because - * handle_exception did not do set SR_IE as it always sees SR_PIE - * being cleared. - */ - csrs CSR_STATUS, SR_IE -#endif -#if defined(CONFIG_TRACE_IRQFLAGS) || defined(CONFIG_CONTEXT_TRACKING_USER) - /* Recover a0 - a7 for system calls */ - REG_L a0, PT_A0(sp) - REG_L a1, PT_A1(sp) - REG_L a2, PT_A2(sp) - REG_L a3, PT_A3(sp) - REG_L a4, PT_A4(sp) - REG_L a5, PT_A5(sp) - REG_L a6, PT_A6(sp) - REG_L a7, PT_A7(sp) -#endif - /* save the initial A0 value (needed in signal handlers) */ - REG_S a0, PT_ORIG_A0(sp) - /* - * Advance SEPC to avoid executing the original - * scall instruction on sret - */ - addi s2, s2, 0x4 - REG_S s2, PT_EPC(sp) - /* Trace syscalls, but only if requested by the user. */ - REG_L t0, TASK_TI_FLAGS(tp) - andi t0, t0, _TIF_SYSCALL_WORK - bnez t0, handle_syscall_trace_enter -check_syscall_nr: - /* Check to make sure we don't jump to a bogus syscall number. */ - li t0, __NR_syscalls - la s0, sys_ni_syscall - /* - * Syscall number held in a7. - * If syscall number is above allowed value, redirect to ni_syscall. - */ - bgeu a7, t0, 3f -#ifdef CONFIG_COMPAT - REG_L s0, PT_STATUS(sp) - srli s0, s0, SR_UXL_SHIFT - andi s0, s0, (SR_UXL >> SR_UXL_SHIFT) - li t0, (SR_UXL_32 >> SR_UXL_SHIFT) - sub t0, s0, t0 - bnez t0, 1f - - /* Call compat_syscall */ - la s0, compat_sys_call_table - j 2f -1: -#endif - /* Call syscall */ - la s0, sys_call_table -2: - slli t0, a7, RISCV_LGPTR - add s0, s0, t0 - REG_L s0, 0(s0) -3: - jalr s0 - -ret_from_syscall: - /* Set user a0 to kernel a0 */ - REG_S a0, PT_A0(sp) - /* - * We didn't execute the actual syscall. - * Seccomp already set return value for the current task pt_regs. - * (If it was configured with SECCOMP_RET_ERRNO/TRACE) - */ -ret_from_syscall_rejected: -#ifdef CONFIG_DEBUG_RSEQ - move a0, sp - call rseq_syscall -#endif - /* Trace syscalls, but only if requested by the user. */ - REG_L t0, TASK_TI_FLAGS(tp) - andi t0, t0, _TIF_SYSCALL_WORK - bnez t0, handle_syscall_trace_exit - +/* + * The ret_from_exception must be called with interrupt disabled. 
Here is the + * caller list: + * - handle_exception + * - ret_from_fork + * - ret_from_kernel_thread + */ SYM_CODE_START_NOALIGN(ret_from_exception) REG_L s0, PT_STATUS(sp) - csrc CSR_STATUS, SR_IE -#ifdef CONFIG_TRACE_IRQFLAGS - call __trace_hardirqs_off -#endif #ifdef CONFIG_RISCV_M_MODE /* the MPP value is too large to be used as an immediate arg for addi */ li t0, SR_MPP @@ -261,17 +143,7 @@ SYM_CODE_START_NOALIGN(ret_from_exception) #else andi s0, s0, SR_SPP #endif - bnez s0, resume_kernel -SYM_CODE_END(ret_from_exception) - - /* Interrupts must be disabled here so flags are checked atomically */ - REG_L s0, TASK_TI_FLAGS(tp) /* current_thread_info->flags */ - andi s1, s0, _TIF_WORK_MASK - bnez s1, resume_userspace_slow -resume_userspace: -#ifdef CONFIG_CONTEXT_TRACKING_USER - call user_enter_callable -#endif + bnez s0, 1f /* Save unwound kernel stack pointer in thread_info */ addi s0, sp, PT_SIZE_ON_STACK @@ -282,18 +154,7 @@ resume_userspace: * structures again. */ csrw CSR_SCRATCH, tp - -restore_all: -#ifdef CONFIG_TRACE_IRQFLAGS - REG_L s1, PT_STATUS(sp) - andi t0, s1, SR_PIE - beqz t0, 1f - call __trace_hardirqs_on - j 2f 1: - call __trace_hardirqs_off -2: -#endif REG_L a0, PT_STATUS(sp) /* * The current load reservation is effectively part of the processor's @@ -356,47 +217,10 @@ restore_all: #else sret #endif - -#if IS_ENABLED(CONFIG_PREEMPTION) -resume_kernel: - REG_L s0, TASK_TI_PREEMPT_COUNT(tp) - bnez s0, restore_all - REG_L s0, TASK_TI_FLAGS(tp) - andi s0, s0, _TIF_NEED_RESCHED - beqz s0, restore_all - call preempt_schedule_irq - j restore_all -#endif - -resume_userspace_slow: - /* Enter slow path for supplementary processing */ - move a0, sp /* pt_regs */ - move a1, s0 /* current_thread_info->flags */ - call do_work_pending - j resume_userspace - -/* Slow paths for ptrace. */ -handle_syscall_trace_enter: - move a0, sp - call do_syscall_trace_enter - move t0, a0 - REG_L a0, PT_A0(sp) - REG_L a1, PT_A1(sp) - REG_L a2, PT_A2(sp) - REG_L a3, PT_A3(sp) - REG_L a4, PT_A4(sp) - REG_L a5, PT_A5(sp) - REG_L a6, PT_A6(sp) - REG_L a7, PT_A7(sp) - bnez t0, ret_from_syscall_rejected - j check_syscall_nr -handle_syscall_trace_exit: - move a0, sp - call do_syscall_trace_exit - j ret_from_exception +SYM_CODE_END(ret_from_exception) #ifdef CONFIG_VMAP_STACK -handle_kernel_stack_overflow: +SYM_CODE_START_LOCAL(handle_kernel_stack_overflow) /* * Takes the psuedo-spinlock for the shadow stack, in case multiple * harts are concurrently overflowing their kernel stacks. We could @@ -505,23 +329,25 @@ restore_caller_reg: REG_S s5, PT_TP(sp) move a0, sp tail handle_bad_stack +SYM_CODE_END(handle_kernel_stack_overflow) #endif -END(handle_exception) - -ENTRY(ret_from_fork) +SYM_CODE_START(ret_from_fork) + call schedule_tail + move a0, sp /* pt_regs */ la ra, ret_from_exception - tail schedule_tail -ENDPROC(ret_from_fork) + tail syscall_exit_to_user_mode +SYM_CODE_END(ret_from_fork) -ENTRY(ret_from_kernel_thread) +SYM_CODE_START(ret_from_kernel_thread) call schedule_tail /* Call fn(arg) */ - la ra, ret_from_exception move a0, s1 - jr s0 -ENDPROC(ret_from_kernel_thread) - + jalr s0 + move a0, sp /* pt_regs */ + la ra, ret_from_exception + tail syscall_exit_to_user_mode +SYM_CODE_END(ret_from_kernel_thread) /* * Integer register context switch @@ -533,7 +359,7 @@ ENDPROC(ret_from_kernel_thread) * The value of a0 and a1 must be preserved by this function, as that's how * arguments are passed to schedule_tail. 
*/ -ENTRY(__switch_to) +SYM_FUNC_START(__switch_to) /* Save context into prev->thread */ li a4, TASK_THREAD_RA add a3, a0, a4 @@ -570,7 +396,7 @@ ENTRY(__switch_to) /* The offset of thread_info in task_struct is zero. */ move tp, a1 ret -ENDPROC(__switch_to) +SYM_FUNC_END(__switch_to) #ifndef CONFIG_MMU #define do_page_fault do_trap_unknown @@ -579,7 +405,7 @@ ENDPROC(__switch_to) .section ".rodata" .align LGREG /* Exception vector table */ -ENTRY(excp_vect_table) +SYM_CODE_START(excp_vect_table) RISCV_PTR do_trap_insn_misaligned ALT_INSN_FAULT(RISCV_PTR do_trap_insn_fault) RISCV_PTR do_trap_insn_illegal @@ -588,7 +414,7 @@ ENTRY(excp_vect_table) RISCV_PTR do_trap_load_fault RISCV_PTR do_trap_store_misaligned RISCV_PTR do_trap_store_fault - RISCV_PTR do_trap_ecall_u /* system call, gets intercepted */ + RISCV_PTR do_trap_ecall_u /* system call */ RISCV_PTR do_trap_ecall_s RISCV_PTR do_trap_unknown RISCV_PTR do_trap_ecall_m @@ -598,11 +424,11 @@ ENTRY(excp_vect_table) RISCV_PTR do_trap_unknown RISCV_PTR do_page_fault /* store page fault */ excp_vect_table_end: -END(excp_vect_table) +SYM_CODE_END(excp_vect_table) #ifndef CONFIG_MMU -ENTRY(__user_rt_sigreturn) +SYM_CODE_START(__user_rt_sigreturn) li a7, __NR_rt_sigreturn scall -END(__user_rt_sigreturn) +SYM_CODE_END(__user_rt_sigreturn) #endif diff --git a/arch/riscv/kernel/head.h b/arch/riscv/kernel/head.h index 726731ada534..a556fdaafed9 100644 --- a/arch/riscv/kernel/head.h +++ b/arch/riscv/kernel/head.h @@ -10,7 +10,6 @@ extern atomic_t hart_lottery; -asmlinkage void do_page_fault(struct pt_regs *regs); asmlinkage void __init setup_vm(uintptr_t dtb_pa); #ifdef CONFIG_XIP_KERNEL asmlinkage void __init __copy_data(void); diff --git a/arch/riscv/kernel/ptrace.c b/arch/riscv/kernel/ptrace.c index 44f4b1ca315d..23c48b14a0e7 100644 --- a/arch/riscv/kernel/ptrace.c +++ b/arch/riscv/kernel/ptrace.c @@ -19,9 +19,6 @@ #include #include -#define CREATE_TRACE_POINTS -#include - enum riscv_regset { REGSET_X, #ifdef CONFIG_FPU @@ -228,46 +225,6 @@ long arch_ptrace(struct task_struct *child, long request, return ret; } -/* - * Allows PTRACE_SYSCALL to work. These are called from entry.S in - * {handle,ret_from}_syscall. - */ -__visible int do_syscall_trace_enter(struct pt_regs *regs) -{ - if (test_thread_flag(TIF_SYSCALL_TRACE)) - if (ptrace_report_syscall_entry(regs)) - return -1; - - /* - * Do the secure computing after ptrace; failures should be fast. - * If this fails we might have return value in a0 from seccomp - * (via SECCOMP_RET_ERRNO/TRACE). 
- */ - if (secure_computing() == -1) - return -1; - -#ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS - if (test_thread_flag(TIF_SYSCALL_TRACEPOINT)) - trace_sys_enter(regs, syscall_get_nr(current, regs)); -#endif - - audit_syscall_entry(regs->a7, regs->a0, regs->a1, regs->a2, regs->a3); - return 0; -} - -__visible void do_syscall_trace_exit(struct pt_regs *regs) -{ - audit_syscall_exit(regs); - - if (test_thread_flag(TIF_SYSCALL_TRACE)) - ptrace_report_syscall_exit(regs, 0); - -#ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS - if (test_thread_flag(TIF_SYSCALL_TRACEPOINT)) - trace_sys_exit(regs, regs_return_value(regs)); -#endif -} - #ifdef CONFIG_COMPAT static int compat_riscv_gpr_get(struct task_struct *target, const struct user_regset *regset, diff --git a/arch/riscv/kernel/signal.c b/arch/riscv/kernel/signal.c index bfb2afa4135f..2e365084417e 100644 --- a/arch/riscv/kernel/signal.c +++ b/arch/riscv/kernel/signal.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -274,7 +275,7 @@ static void handle_signal(struct ksignal *ksig, struct pt_regs *regs) signal_setup_done(ret, ksig, 0); } -static void do_signal(struct pt_regs *regs) +void arch_do_signal_or_restart(struct pt_regs *regs) { struct ksignal ksig; @@ -311,29 +312,3 @@ static void do_signal(struct pt_regs *regs) */ restore_saved_sigmask(); } - -/* - * Handle any pending work on the resume-to-userspace path, as indicated by - * _TIF_WORK_MASK. Entered from assembly with IRQs off. - */ -asmlinkage __visible void do_work_pending(struct pt_regs *regs, - unsigned long thread_info_flags) -{ - do { - if (thread_info_flags & _TIF_NEED_RESCHED) { - schedule(); - } else { - local_irq_enable(); - if (thread_info_flags & _TIF_UPROBE) - uprobe_notify_resume(regs); - /* Handle pending signal delivery */ - if (thread_info_flags & (_TIF_SIGPENDING | - _TIF_NOTIFY_SIGNAL)) - do_signal(regs); - if (thread_info_flags & _TIF_NOTIFY_RESUME) - resume_user_mode_work(regs); - } - local_irq_disable(); - thread_info_flags = read_thread_flags(); - } while (thread_info_flags & _TIF_WORK_MASK); -} diff --git a/arch/riscv/kernel/traps.c b/arch/riscv/kernel/traps.c index 02727a4fe8be..1f4e37be7eb3 100644 --- a/arch/riscv/kernel/traps.c +++ b/arch/riscv/kernel/traps.c @@ -17,12 +17,14 @@ #include #include #include +#include #include #include #include #include #include +#include #include int show_unhandled_signals = 1; @@ -123,10 +125,18 @@ static void do_trap_error(struct pt_regs *regs, int signo, int code, #else #define __trap_section noinstr #endif -#define DO_ERROR_INFO(name, signo, code, str) \ -asmlinkage __visible __trap_section void name(struct pt_regs *regs) \ -{ \ - do_trap_error(regs, signo, code, regs->epc, "Oops - " str); \ +#define DO_ERROR_INFO(name, signo, code, str) \ +asmlinkage __visible __trap_section void name(struct pt_regs *regs) \ +{ \ + if (user_mode(regs)) { \ + irqentry_enter_from_user_mode(regs); \ + do_trap_error(regs, signo, code, regs->epc, "Oops - " str); \ + irqentry_exit_to_user_mode(regs); \ + } else { \ + irqentry_state_t state = irqentry_nmi_enter(regs); \ + do_trap_error(regs, signo, code, regs->epc, "Oops - " str); \ + irqentry_nmi_exit(regs, state); \ + } \ } DO_ERROR_INFO(do_trap_unknown, @@ -148,26 +158,50 @@ DO_ERROR_INFO(do_trap_store_misaligned, int handle_misaligned_load(struct pt_regs *regs); int handle_misaligned_store(struct pt_regs *regs); -asmlinkage void __trap_section do_trap_load_misaligned(struct pt_regs *regs) +asmlinkage __visible __trap_section void do_trap_load_misaligned(struct pt_regs *regs) { - 
if (!handle_misaligned_load(regs)) - return; - do_trap_error(regs, SIGBUS, BUS_ADRALN, regs->epc, - "Oops - load address misaligned"); + if (user_mode(regs)) { + irqentry_enter_from_user_mode(regs); + + if (handle_misaligned_load(regs)) + do_trap_error(regs, SIGBUS, BUS_ADRALN, regs->epc, + "Oops - load address misaligned"); + + irqentry_exit_to_user_mode(regs); + } else { + irqentry_state_t state = irqentry_nmi_enter(regs); + + if (handle_misaligned_load(regs)) + do_trap_error(regs, SIGBUS, BUS_ADRALN, regs->epc, + "Oops - load address misaligned"); + + irqentry_nmi_exit(regs, state); + } } -asmlinkage void __trap_section do_trap_store_misaligned(struct pt_regs *regs) +asmlinkage __visible __trap_section void do_trap_store_misaligned(struct pt_regs *regs) { - if (!handle_misaligned_store(regs)) - return; - do_trap_error(regs, SIGBUS, BUS_ADRALN, regs->epc, - "Oops - store (or AMO) address misaligned"); + if (user_mode(regs)) { + irqentry_enter_from_user_mode(regs); + + if (handle_misaligned_store(regs)) + do_trap_error(regs, SIGBUS, BUS_ADRALN, regs->epc, + "Oops - store (or AMO) address misaligned"); + + irqentry_exit_to_user_mode(regs); + } else { + irqentry_state_t state = irqentry_nmi_enter(regs); + + if (handle_misaligned_store(regs)) + do_trap_error(regs, SIGBUS, BUS_ADRALN, regs->epc, + "Oops - store (or AMO) address misaligned"); + + irqentry_nmi_exit(regs, state); + } } #endif DO_ERROR_INFO(do_trap_store_fault, SIGSEGV, SEGV_ACCERR, "store (or AMO) access fault"); -DO_ERROR_INFO(do_trap_ecall_u, - SIGILL, ILL_ILLTRP, "environment call from U-mode"); DO_ERROR_INFO(do_trap_ecall_s, SIGILL, ILL_ILLTRP, "environment call from S-mode"); DO_ERROR_INFO(do_trap_ecall_m, @@ -183,7 +217,7 @@ static inline unsigned long get_break_insn_length(unsigned long pc) return GET_INSN_LENGTH(insn); } -asmlinkage __visible __trap_section void do_trap_break(struct pt_regs *regs) +void handle_break(struct pt_regs *regs) { #ifdef CONFIG_KPROBES if (kprobe_single_step_handler(regs)) @@ -213,7 +247,77 @@ asmlinkage __visible __trap_section void do_trap_break(struct pt_regs *regs) else die(regs, "Kernel BUG"); } -NOKPROBE_SYMBOL(do_trap_break); + +asmlinkage __visible __trap_section void do_trap_break(struct pt_regs *regs) +{ + if (user_mode(regs)) { + irqentry_enter_from_user_mode(regs); + + handle_break(regs); + + irqentry_exit_to_user_mode(regs); + } else { + irqentry_state_t state = irqentry_nmi_enter(regs); + + handle_break(regs); + + irqentry_nmi_exit(regs, state); + } +} + +asmlinkage __visible __trap_section void do_trap_ecall_u(struct pt_regs *regs) +{ + if (user_mode(regs)) { + ulong syscall = regs->a7; + + syscall = syscall_enter_from_user_mode(regs, syscall); + + regs->epc += 4; + regs->orig_a0 = regs->a0; + + if (syscall < NR_syscalls) + syscall_handler(regs, syscall); + else + regs->a0 = -ENOSYS; + + syscall_exit_to_user_mode(regs); + } else { + irqentry_state_t state = irqentry_nmi_enter(regs); + + do_trap_error(regs, SIGILL, ILL_ILLTRP, regs->epc, + "Oops - environment call from U-mode"); + + irqentry_nmi_exit(regs, state); + } + +} + +#ifdef CONFIG_MMU +asmlinkage __visible noinstr void do_page_fault(struct pt_regs *regs) +{ + irqentry_state_t state = irqentry_enter(regs); + + handle_page_fault(regs); + + local_irq_disable(); + + irqentry_exit(regs, state); +} +#endif + +asmlinkage __visible noinstr void do_irq(struct pt_regs *regs) +{ + struct pt_regs *old_regs; + irqentry_state_t state = irqentry_enter(regs); + + irq_enter_rcu(); + old_regs = set_irq_regs(regs); + handle_arch_irq(regs); + 
set_irq_regs(old_regs); + irq_exit_rcu(); + + irqentry_exit(regs, state); +} #ifdef CONFIG_GENERIC_BUG int is_valid_bugaddr(unsigned long pc) diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c index 460f785f6e09..3aba72ec1742 100644 --- a/arch/riscv/mm/fault.c +++ b/arch/riscv/mm/fault.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -204,7 +205,7 @@ static inline bool access_error(unsigned long cause, struct vm_area_struct *vma) * This routine handles page faults. It determines the address and the * problem, and then passes it off to one of the appropriate routines. */ -asmlinkage void do_page_fault(struct pt_regs *regs) +void handle_page_fault(struct pt_regs *regs) { struct task_struct *tsk; struct vm_area_struct *vma; @@ -251,7 +252,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs) } #endif /* Enable interrupts if they were enabled in the parent context. */ - if (likely(regs->status & SR_PIE)) + if (!regs_irqs_disabled(regs)) local_irq_enable(); /* @@ -356,4 +357,3 @@ good_area: } return; } -NOKPROBE_SYMBOL(do_page_fault); -- cgit v1.2.3 From 0bf298ad2b61ae69d38826f3513e5fffc3fe3a53 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Tue, 21 Feb 2023 22:30:19 -0500 Subject: riscv: entry: Remove extra level wrappers of trace_hardirqs_{on,off} MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since riscv is converted to generic entry, there's no need for the extra wrappers of trace_hardirqs_{on,off}. Signed-off-by: Jisheng Zhang Reviewed-by: Guo Ren Reviewed-by: Björn Töpel Tested-by: Guo Ren Signed-off-by: Guo Ren Link: https://lore.kernel.org/r/20230222033021.983168-6-guoren@kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/Makefile | 2 -- arch/riscv/kernel/trace_irq.c | 27 --------------------------- arch/riscv/kernel/trace_irq.h | 11 ----------- 3 files changed, 40 deletions(-) delete mode 100644 arch/riscv/kernel/trace_irq.c delete mode 100644 arch/riscv/kernel/trace_irq.h diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile index 4cf303a779ab..392fa6e35d4a 100644 --- a/arch/riscv/kernel/Makefile +++ b/arch/riscv/kernel/Makefile @@ -68,8 +68,6 @@ obj-$(CONFIG_CPU_PM) += suspend_entry.o suspend.o obj-$(CONFIG_FUNCTION_TRACER) += mcount.o ftrace.o obj-$(CONFIG_DYNAMIC_FTRACE) += mcount-dyn.o -obj-$(CONFIG_TRACE_IRQFLAGS) += trace_irq.o - obj-$(CONFIG_PERF_EVENTS) += perf_callchain.o obj-$(CONFIG_HAVE_PERF_REGS) += perf_regs.o obj-$(CONFIG_RISCV_SBI) += sbi.o diff --git a/arch/riscv/kernel/trace_irq.c b/arch/riscv/kernel/trace_irq.c deleted file mode 100644 index 095ac976d7da..000000000000 --- a/arch/riscv/kernel/trace_irq.c +++ /dev/null @@ -1,27 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2022 Changbin Du - */ - -#include -#include -#include "trace_irq.h" - -/* - * trace_hardirqs_on/off require the caller to setup frame pointer properly. - * Otherwise, CALLER_ADDR1 might trigger an pagging exception in kernel. - * Here we add one extra level so they can be safely called by low - * level entry code which $fp is used for other purpose. 
- */ - -void __trace_hardirqs_on(void) -{ - trace_hardirqs_on(); -} -NOKPROBE_SYMBOL(__trace_hardirqs_on); - -void __trace_hardirqs_off(void) -{ - trace_hardirqs_off(); -} -NOKPROBE_SYMBOL(__trace_hardirqs_off); diff --git a/arch/riscv/kernel/trace_irq.h b/arch/riscv/kernel/trace_irq.h deleted file mode 100644 index 99fe67377e5e..000000000000 --- a/arch/riscv/kernel/trace_irq.h +++ /dev/null @@ -1,11 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (C) 2022 Changbin Du - */ -#ifndef __TRACE_IRQ_H -#define __TRACE_IRQ_H - -void __trace_hardirqs_on(void); -void __trace_hardirqs_off(void); - -#endif /* __TRACE_IRQ_H */ -- cgit v1.2.3 From ab9164dae27334415537ccf1c3fbabf56b7793b2 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Tue, 21 Feb 2023 22:30:20 -0500 Subject: riscv: entry: Consolidate ret_from_kernel_thread into ret_from_fork MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ret_from_kernel_thread() behaves similarly with ret_from_fork(), the only difference is whether call the fn(arg) or not, this can be achieved by testing fn is NULL or not, I.E s0 is 0 or not. Many architectures have done the same thing, it makes entry.S more clean. Signed-off-by: Jisheng Zhang Reviewed-by: Björn Töpel Reviewed-by: Guo Ren Tested-by: Guo Ren Signed-off-by: Guo Ren Link: https://lore.kernel.org/r/20230222033021.983168-7-guoren@kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/entry.S | 12 +++--------- arch/riscv/kernel/process.c | 5 ++--- 2 files changed, 5 insertions(+), 12 deletions(-) diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S index bc322f92ba34..5ccef259498d 100644 --- a/arch/riscv/kernel/entry.S +++ b/arch/riscv/kernel/entry.S @@ -132,7 +132,6 @@ SYM_CODE_END(handle_exception) * caller list: * - handle_exception * - ret_from_fork - * - ret_from_kernel_thread */ SYM_CODE_START_NOALIGN(ret_from_exception) REG_L s0, PT_STATUS(sp) @@ -334,20 +333,15 @@ SYM_CODE_END(handle_kernel_stack_overflow) SYM_CODE_START(ret_from_fork) call schedule_tail - move a0, sp /* pt_regs */ - la ra, ret_from_exception - tail syscall_exit_to_user_mode -SYM_CODE_END(ret_from_fork) - -SYM_CODE_START(ret_from_kernel_thread) - call schedule_tail + beqz s0, 1f /* not from kernel thread */ /* Call fn(arg) */ move a0, s1 jalr s0 +1: move a0, sp /* pt_regs */ la ra, ret_from_exception tail syscall_exit_to_user_mode -SYM_CODE_END(ret_from_kernel_thread) +SYM_CODE_END(ret_from_fork) /* * Integer register context switch diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c index 774ffde386ab..e2a060066730 100644 --- a/arch/riscv/kernel/process.c +++ b/arch/riscv/kernel/process.c @@ -34,7 +34,6 @@ EXPORT_SYMBOL(__stack_chk_guard); #endif extern asmlinkage void ret_from_fork(void); -extern asmlinkage void ret_from_kernel_thread(void); void arch_cpu_idle(void) { @@ -173,7 +172,6 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) /* Supervisor/Machine, irqs on: */ childregs->status = SR_PP | SR_PIE; - p->thread.ra = (unsigned long)ret_from_kernel_thread; p->thread.s[0] = (unsigned long)args->fn; p->thread.s[1] = (unsigned long)args->fn_arg; } else { @@ -183,8 +181,9 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) if (clone_flags & CLONE_SETTLS) childregs->tp = tls; childregs->a0 = 0; /* Return value of fork() */ - p->thread.ra = (unsigned long)ret_from_fork; + p->thread.s[0] = 0; } + p->thread.ra = (unsigned long)ret_from_fork; p->thread.sp = (unsigned 
long)childregs; /* kernel sp */ return 0; } -- cgit v1.2.3 From 45b32b946a97bb4553bab30c2519bbaac55f39db Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Tue, 21 Feb 2023 22:30:21 -0500 Subject: riscv: entry: Consolidate general regs saving/restoring MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Consolidate the saving/restoring GPs (except zero, ra, sp, gp, tp and t0) into save_from_x6_to_x31/restore_from_x6_to_x31 macros. No functional change intended. Signed-off-by: Jisheng Zhang Reviewed-by: Guo Ren Reviewed-by: Björn Töpel Tested-by: Guo Ren Signed-off-by: Guo Ren Link: https://lore.kernel.org/r/20230222033021.983168-8-guoren@kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/asm.h | 61 +++++++++++++++++++++++++++++++ arch/riscv/kernel/entry.S | 81 ++---------------------------------------- arch/riscv/kernel/mcount-dyn.S | 57 +++-------------------------- 3 files changed, 68 insertions(+), 131 deletions(-) diff --git a/arch/riscv/include/asm/asm.h b/arch/riscv/include/asm/asm.h index 816e753de636..114bbadaef41 100644 --- a/arch/riscv/include/asm/asm.h +++ b/arch/riscv/include/asm/asm.h @@ -69,6 +69,7 @@ #endif #ifdef __ASSEMBLY__ +#include /* Common assembly source macros */ @@ -81,6 +82,66 @@ .endr .endm + /* save all GPs except x1 ~ x5 */ + .macro save_from_x6_to_x31 + REG_S x6, PT_T1(sp) + REG_S x7, PT_T2(sp) + REG_S x8, PT_S0(sp) + REG_S x9, PT_S1(sp) + REG_S x10, PT_A0(sp) + REG_S x11, PT_A1(sp) + REG_S x12, PT_A2(sp) + REG_S x13, PT_A3(sp) + REG_S x14, PT_A4(sp) + REG_S x15, PT_A5(sp) + REG_S x16, PT_A6(sp) + REG_S x17, PT_A7(sp) + REG_S x18, PT_S2(sp) + REG_S x19, PT_S3(sp) + REG_S x20, PT_S4(sp) + REG_S x21, PT_S5(sp) + REG_S x22, PT_S6(sp) + REG_S x23, PT_S7(sp) + REG_S x24, PT_S8(sp) + REG_S x25, PT_S9(sp) + REG_S x26, PT_S10(sp) + REG_S x27, PT_S11(sp) + REG_S x28, PT_T3(sp) + REG_S x29, PT_T4(sp) + REG_S x30, PT_T5(sp) + REG_S x31, PT_T6(sp) + .endm + + /* restore all GPs except x1 ~ x5 */ + .macro restore_from_x6_to_x31 + REG_L x6, PT_T1(sp) + REG_L x7, PT_T2(sp) + REG_L x8, PT_S0(sp) + REG_L x9, PT_S1(sp) + REG_L x10, PT_A0(sp) + REG_L x11, PT_A1(sp) + REG_L x12, PT_A2(sp) + REG_L x13, PT_A3(sp) + REG_L x14, PT_A4(sp) + REG_L x15, PT_A5(sp) + REG_L x16, PT_A6(sp) + REG_L x17, PT_A7(sp) + REG_L x18, PT_S2(sp) + REG_L x19, PT_S3(sp) + REG_L x20, PT_S4(sp) + REG_L x21, PT_S5(sp) + REG_L x22, PT_S6(sp) + REG_L x23, PT_S7(sp) + REG_L x24, PT_S8(sp) + REG_L x25, PT_S9(sp) + REG_L x26, PT_S10(sp) + REG_L x27, PT_S11(sp) + REG_L x28, PT_T3(sp) + REG_L x29, PT_T4(sp) + REG_L x30, PT_T5(sp) + REG_L x31, PT_T6(sp) + .endm + #endif /* __ASSEMBLY__ */ #endif /* _ASM_RISCV_ASM_H */ diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S index 5ccef259498d..3fbb100bc9e4 100644 --- a/arch/riscv/kernel/entry.S +++ b/arch/riscv/kernel/entry.S @@ -42,32 +42,7 @@ _save_context: REG_S x1, PT_RA(sp) REG_S x3, PT_GP(sp) REG_S x5, PT_T0(sp) - REG_S x6, PT_T1(sp) - REG_S x7, PT_T2(sp) - REG_S x8, PT_S0(sp) - REG_S x9, PT_S1(sp) - REG_S x10, PT_A0(sp) - REG_S x11, PT_A1(sp) - REG_S x12, PT_A2(sp) - REG_S x13, PT_A3(sp) - REG_S x14, PT_A4(sp) - REG_S x15, PT_A5(sp) - REG_S x16, PT_A6(sp) - REG_S x17, PT_A7(sp) - REG_S x18, PT_S2(sp) - REG_S x19, PT_S3(sp) - REG_S x20, PT_S4(sp) - REG_S x21, PT_S5(sp) - REG_S x22, PT_S6(sp) - REG_S x23, PT_S7(sp) - REG_S x24, PT_S8(sp) - REG_S x25, PT_S9(sp) - REG_S x26, PT_S10(sp) - REG_S x27, PT_S11(sp) - REG_S x28, PT_T3(sp) - REG_S x29, PT_T4(sp) - REG_S x30, PT_T5(sp) - REG_S x31, 
PT_T6(sp) + save_from_x6_to_x31 /* * Disable user-mode memory access as it should only be set in the @@ -182,32 +157,7 @@ SYM_CODE_START_NOALIGN(ret_from_exception) REG_L x3, PT_GP(sp) REG_L x4, PT_TP(sp) REG_L x5, PT_T0(sp) - REG_L x6, PT_T1(sp) - REG_L x7, PT_T2(sp) - REG_L x8, PT_S0(sp) - REG_L x9, PT_S1(sp) - REG_L x10, PT_A0(sp) - REG_L x11, PT_A1(sp) - REG_L x12, PT_A2(sp) - REG_L x13, PT_A3(sp) - REG_L x14, PT_A4(sp) - REG_L x15, PT_A5(sp) - REG_L x16, PT_A6(sp) - REG_L x17, PT_A7(sp) - REG_L x18, PT_S2(sp) - REG_L x19, PT_S3(sp) - REG_L x20, PT_S4(sp) - REG_L x21, PT_S5(sp) - REG_L x22, PT_S6(sp) - REG_L x23, PT_S7(sp) - REG_L x24, PT_S8(sp) - REG_L x25, PT_S9(sp) - REG_L x26, PT_S10(sp) - REG_L x27, PT_S11(sp) - REG_L x28, PT_T3(sp) - REG_L x29, PT_T4(sp) - REG_L x30, PT_T5(sp) - REG_L x31, PT_T6(sp) + restore_from_x6_to_x31 REG_L x2, PT_SP(sp) @@ -287,32 +237,7 @@ restore_caller_reg: REG_S x1, PT_RA(sp) REG_S x3, PT_GP(sp) REG_S x5, PT_T0(sp) - REG_S x6, PT_T1(sp) - REG_S x7, PT_T2(sp) - REG_S x8, PT_S0(sp) - REG_S x9, PT_S1(sp) - REG_S x10, PT_A0(sp) - REG_S x11, PT_A1(sp) - REG_S x12, PT_A2(sp) - REG_S x13, PT_A3(sp) - REG_S x14, PT_A4(sp) - REG_S x15, PT_A5(sp) - REG_S x16, PT_A6(sp) - REG_S x17, PT_A7(sp) - REG_S x18, PT_S2(sp) - REG_S x19, PT_S3(sp) - REG_S x20, PT_S4(sp) - REG_S x21, PT_S5(sp) - REG_S x22, PT_S6(sp) - REG_S x23, PT_S7(sp) - REG_S x24, PT_S8(sp) - REG_S x25, PT_S9(sp) - REG_S x26, PT_S10(sp) - REG_S x27, PT_S11(sp) - REG_S x28, PT_T3(sp) - REG_S x29, PT_T4(sp) - REG_S x30, PT_T5(sp) - REG_S x31, PT_T6(sp) + save_from_x6_to_x31 REG_L s0, TASK_TI_KERNEL_SP(tp) csrr s1, CSR_STATUS diff --git a/arch/riscv/kernel/mcount-dyn.S b/arch/riscv/kernel/mcount-dyn.S index 125de818d1ba..669b8697aa38 100644 --- a/arch/riscv/kernel/mcount-dyn.S +++ b/arch/riscv/kernel/mcount-dyn.S @@ -66,66 +66,17 @@ REG_S x3, PT_GP(sp) REG_S x4, PT_TP(sp) REG_S x5, PT_T0(sp) - REG_S x6, PT_T1(sp) - REG_S x7, PT_T2(sp) - REG_S x8, PT_S0(sp) - REG_S x9, PT_S1(sp) - REG_S x10, PT_A0(sp) - REG_S x11, PT_A1(sp) - REG_S x12, PT_A2(sp) - REG_S x13, PT_A3(sp) - REG_S x14, PT_A4(sp) - REG_S x15, PT_A5(sp) - REG_S x16, PT_A6(sp) - REG_S x17, PT_A7(sp) - REG_S x18, PT_S2(sp) - REG_S x19, PT_S3(sp) - REG_S x20, PT_S4(sp) - REG_S x21, PT_S5(sp) - REG_S x22, PT_S6(sp) - REG_S x23, PT_S7(sp) - REG_S x24, PT_S8(sp) - REG_S x25, PT_S9(sp) - REG_S x26, PT_S10(sp) - REG_S x27, PT_S11(sp) - REG_S x28, PT_T3(sp) - REG_S x29, PT_T4(sp) - REG_S x30, PT_T5(sp) - REG_S x31, PT_T6(sp) + save_from_x6_to_x31 .endm .macro RESTORE_ALL - REG_L t0, PT_EPC(sp) REG_L x1, PT_RA(sp) REG_L x2, PT_SP(sp) REG_L x3, PT_GP(sp) REG_L x4, PT_TP(sp) - REG_L x6, PT_T1(sp) - REG_L x7, PT_T2(sp) - REG_L x8, PT_S0(sp) - REG_L x9, PT_S1(sp) - REG_L x10, PT_A0(sp) - REG_L x11, PT_A1(sp) - REG_L x12, PT_A2(sp) - REG_L x13, PT_A3(sp) - REG_L x14, PT_A4(sp) - REG_L x15, PT_A5(sp) - REG_L x16, PT_A6(sp) - REG_L x17, PT_A7(sp) - REG_L x18, PT_S2(sp) - REG_L x19, PT_S3(sp) - REG_L x20, PT_S4(sp) - REG_L x21, PT_S5(sp) - REG_L x22, PT_S6(sp) - REG_L x23, PT_S7(sp) - REG_L x24, PT_S8(sp) - REG_L x25, PT_S9(sp) - REG_L x26, PT_S10(sp) - REG_L x27, PT_S11(sp) - REG_L x28, PT_T3(sp) - REG_L x29, PT_T4(sp) - REG_L x30, PT_T5(sp) - REG_L x31, PT_T6(sp) + /* Restore t0 with PT_EPC */ + REG_L x5, PT_EPC(sp) + restore_from_x6_to_x31 addi sp, sp, PT_SIZE_ON_STACK .endm -- cgit v1.2.3 From b5e2c507b06c9d36411845149162a804ae7b04a9 Mon Sep 17 00:00:00 2001 From: Yimin Gu Date: Tue, 28 Feb 2023 19:26:56 -0500 Subject: riscv: Kconfig: Allow RV32 to build with 
no MMU Some RISC-V 32bit cores do not have an MMU, and the kernel should be able to build for them. This patch enables the RV32 to be built with no MMU support. Signed-off-by: Yimin Gu CC: Jesse Taube Tested-by: Waldemar Brodkorb Signed-off-by: Jesse Taube Reviewed-by: Damien Le Moal Reviewed-by: Conor Dooley Link: https://lore.kernel.org/r/20230301002657.352637-3-Mr.Bossman075@gmail.com Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index c5e42cc37604..d1f661425790 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -177,8 +177,8 @@ config MMU config PAGE_OFFSET hex - default 0xC0000000 if 32BIT - default 0x80000000 if 64BIT && !MMU + default 0xC0000000 if 32BIT && MMU + default 0x80000000 if !MMU default 0xff60000000000000 if 64BIT config KASAN_SHADOW_OFFSET @@ -279,7 +279,6 @@ config ARCH_RV32I select GENERIC_LIB_ASHRDI3 select GENERIC_LIB_LSHRDI3 select GENERIC_LIB_UCMPDI2 - select MMU config ARCH_RV64I bool "RV64I" -- cgit v1.2.3 From 77c0c966719f07f14c82f358745f77582580cdcd Mon Sep 17 00:00:00 2001 From: Jesse Taube Date: Tue, 28 Feb 2023 19:26:57 -0500 Subject: riscv: configs: Add nommu PHONY defconfig for RV32 32bit risc-v can be configured to run without MMU. Introduce rv32_nommu_virt_defconfig .PHONY target, that is based on nommu_virt_defconfig. This is similar to how rv32_defconfig is based on "defconfig". Suggested-by: Conor Dooley Signed-off-by: Jesse Taube Cc: Yimin Gu Reviewed-by: Conor Dooley Link: https://lore.kernel.org/r/20230301002657.352637-4-Mr.Bossman075@gmail.com Signed-off-by: Palmer Dabbelt --- arch/riscv/Makefile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile index 6203c3378922..1b276f62f22b 100644 --- a/arch/riscv/Makefile +++ b/arch/riscv/Makefile @@ -174,3 +174,7 @@ rv64_randconfig: PHONY += rv32_defconfig rv32_defconfig: $(Q)$(MAKE) -f $(srctree)/Makefile defconfig 32-bit.config + +PHONY += rv32_nommu_virt_defconfig +rv32_nommu_virt_defconfig: + $(Q)$(MAKE) -f $(srctree)/Makefile nommu_virt_defconfig 32-bit.config -- cgit v1.2.3 From d34a6b715a23ccd9c9d0bc7a475bea59dc3e28b2 Mon Sep 17 00:00:00 2001 From: Conor Dooley Date: Fri, 24 Mar 2023 12:12:41 +0000 Subject: RISC-V: convert new selectors of RISCV_ALTERNATIVE to dependencies for-next contains two additional extensions that select RISCV_ALTERNATIVE. RISCV_ALTERNATIVE no longer needs to be selected by individual config options as it is now selected for !XIP_KERNEL builds by the top level RISCV option. These extensions rely on the alternative framework, so convert the "select"s to "depends on"s instead. Signed-off-by: Conor Dooley Reviewed-by: Andrew Jones Link: https://lore.kernel.org/r/20230324121240.3594777-1-conor.dooley@microchip.com Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index b47c08364dc3..cc02eb9eee1f 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -402,8 +402,8 @@ config RISCV_ISA_C config RISCV_ISA_SVNAPOT bool "SVNAPOT extension support" depends on 64BIT && MMU + depends on RISCV_ALTERNATIVE default y - select RISCV_ALTERNATIVE help Allow kernel to detect the SVNAPOT ISA-extension dynamically at boot time and enable its usage. 
@@ -478,8 +478,8 @@ config RISCV_ISA_ZICBOM config RISCV_ISA_ZICBOZ bool "Zicboz extension support for faster zeroing of memory" - depends on !XIP_KERNEL && MMU - select RISCV_ALTERNATIVE + depends on MMU + depends on RISCV_ALTERNATIVE default y help Enable the use of the ZICBOZ extension (cbo.zero instruction) -- cgit v1.2.3 From 9c2598d43510eff09c658f9c0e0f921ba1871c4b Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Mon, 3 Apr 2023 08:52:07 +0200 Subject: riscv: entry: Save a0 prior syscall_enter_from_user_mode() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The RISC-V calling convention passes the first argument, and the return value in the a0 register. For this reason, the a0 register needs some extra care; When handling syscalls, the a0 register is saved into regs->orig_a0, so a0 can be properly restored for, e.g. interrupted syscalls. This functionality was broken with the introduction of the generic entry patches. Here, a0 was saved into orig_a0 after calling syscall_enter_from_user_mode(), which can change regs->a0 for some paths, incorrectly restoring a0. This is resolved, by saving a0 prior doing the syscall_enter_from_user_mode() call. Fixes: f0bddf50586d ("riscv: entry: Convert to generic entry") Reviewed-by: Heiko Stuebner Tested-by: Heiko Stuebner Signed-off-by: Björn Töpel Reported-by: Conor Dooley Reviewed-by: Conor Dooley Tested-by: Conor Dooley Tested-by: Geert Uytterhoeven Tested-by: Andy Chiu Link: https://lore.kernel.org/r/20230403065207.1070974-1-bjorn@kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/traps.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/riscv/kernel/traps.c b/arch/riscv/kernel/traps.c index 1f4e37be7eb3..8c258b78c925 100644 --- a/arch/riscv/kernel/traps.c +++ b/arch/riscv/kernel/traps.c @@ -270,11 +270,11 @@ asmlinkage __visible __trap_section void do_trap_ecall_u(struct pt_regs *regs) if (user_mode(regs)) { ulong syscall = regs->a7; - syscall = syscall_enter_from_user_mode(regs, syscall); - regs->epc += 4; regs->orig_a0 = regs->a0; + syscall = syscall_enter_from_user_mode(regs, syscall); + if (syscall < NR_syscalls) syscall_handler(regs, syscall); else -- cgit v1.2.3 From 6a24915145c922b79d3ac78f681137a4c14a6d6b Mon Sep 17 00:00:00 2001 From: Song Shuai Date: Wed, 8 Mar 2023 14:47:34 +0800 Subject: Revert "riscv: Set more data to cacheinfo" This reverts commit baf7cbd94b5688f167443a2cc3dcea3300132099. There are some duplicate cache attributes populations executed in both ci_leaf_init() and later cache_setup_properties(). Revert the commit baf7cbd94b56 ("riscv: Set more data to cacheinfo") to setup only the level and type attributes at this early place. 
Signed-off-by: Song Shuai Acked-by: Sudeep Holla Acked-by: Conor Dooley Link: https://lore.kernel.org/r/20230308064734.512457-1-suagrfillet@gmail.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/cacheinfo.c | 66 ++++++++++--------------------------------- 1 file changed, 15 insertions(+), 51 deletions(-) diff --git a/arch/riscv/kernel/cacheinfo.c b/arch/riscv/kernel/cacheinfo.c index 3a13113f1b29..305ebbdc780d 100644 --- a/arch/riscv/kernel/cacheinfo.c +++ b/arch/riscv/kernel/cacheinfo.c @@ -64,53 +64,12 @@ uintptr_t get_cache_geometry(u32 level, enum cache_type type) 0; } -static void ci_leaf_init(struct cacheinfo *this_leaf, enum cache_type type, - unsigned int level, unsigned int size, - unsigned int sets, unsigned int line_size) +static void ci_leaf_init(struct cacheinfo *this_leaf, + struct device_node *node, + enum cache_type type, unsigned int level) { this_leaf->level = level; this_leaf->type = type; - this_leaf->size = size; - this_leaf->number_of_sets = sets; - this_leaf->coherency_line_size = line_size; - - /* - * If the cache is fully associative, there is no need to - * check the other properties. - */ - if (sets == 1) - return; - - /* - * Set the ways number for n-ways associative, make sure - * all properties are big than zero. - */ - if (sets > 0 && size > 0 && line_size > 0) - this_leaf->ways_of_associativity = (size / sets) / line_size; -} - -static void fill_cacheinfo(struct cacheinfo **this_leaf, - struct device_node *node, unsigned int level) -{ - unsigned int size, sets, line_size; - - if (!of_property_read_u32(node, "cache-size", &size) && - !of_property_read_u32(node, "cache-block-size", &line_size) && - !of_property_read_u32(node, "cache-sets", &sets)) { - ci_leaf_init((*this_leaf)++, CACHE_TYPE_UNIFIED, level, size, sets, line_size); - } - - if (!of_property_read_u32(node, "i-cache-size", &size) && - !of_property_read_u32(node, "i-cache-sets", &sets) && - !of_property_read_u32(node, "i-cache-block-size", &line_size)) { - ci_leaf_init((*this_leaf)++, CACHE_TYPE_INST, level, size, sets, line_size); - } - - if (!of_property_read_u32(node, "d-cache-size", &size) && - !of_property_read_u32(node, "d-cache-sets", &sets) && - !of_property_read_u32(node, "d-cache-block-size", &line_size)) { - ci_leaf_init((*this_leaf)++, CACHE_TYPE_DATA, level, size, sets, line_size); - } } int populate_cache_leaves(unsigned int cpu) @@ -121,24 +80,29 @@ int populate_cache_leaves(unsigned int cpu) struct device_node *prev = NULL; int levels = 1, level = 1; - /* Level 1 caches in cpu node */ - fill_cacheinfo(&this_leaf, np, level); + if (of_property_read_bool(np, "cache-size")) + ci_leaf_init(this_leaf++, np, CACHE_TYPE_UNIFIED, level); + if (of_property_read_bool(np, "i-cache-size")) + ci_leaf_init(this_leaf++, np, CACHE_TYPE_INST, level); + if (of_property_read_bool(np, "d-cache-size")) + ci_leaf_init(this_leaf++, np, CACHE_TYPE_DATA, level); - /* Next level caches in cache nodes */ prev = np; while ((np = of_find_next_cache_node(np))) { of_node_put(prev); prev = np; - if (!of_device_is_compatible(np, "cache")) break; if (of_property_read_u32(np, "cache-level", &level)) break; if (level <= levels) break; - - fill_cacheinfo(&this_leaf, np, level); - + if (of_property_read_bool(np, "cache-size")) + ci_leaf_init(this_leaf++, np, CACHE_TYPE_UNIFIED, level); + if (of_property_read_bool(np, "i-cache-size")) + ci_leaf_init(this_leaf++, np, CACHE_TYPE_INST, level); + if (of_property_read_bool(np, "d-cache-size")) + ci_leaf_init(this_leaf++, np, CACHE_TYPE_DATA, level); levels = level; } 
of_node_put(np); -- cgit v1.2.3 From ff77cf5b2e033d0bb5e3b7f83ebf65c5adc20d12 Mon Sep 17 00:00:00 2001 From: Evan Green Date: Fri, 7 Apr 2023 16:10:58 -0700 Subject: RISC-V: Move struct riscv_cpuinfo to new header In preparation for tracking and exposing microarchitectural details to userspace (like whether or not unaligned accesses are fast), move the riscv_cpuinfo struct out to its own new cpufeature.h header. It will need to be used by more than just cpu.c. Signed-off-by: Evan Green Reviewed-by: Conor Dooley Reviewed-by: Heiko Stuebner Tested-by: Heiko Stuebner Reviewed-by: Paul Walmsley Link: https://lore.kernel.org/r/20230407231103.2622178-2-evan@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/cpufeature.h | 21 +++++++++++++++++++++ arch/riscv/kernel/cpu.c | 8 ++------ 2 files changed, 23 insertions(+), 6 deletions(-) create mode 100644 arch/riscv/include/asm/cpufeature.h diff --git a/arch/riscv/include/asm/cpufeature.h b/arch/riscv/include/asm/cpufeature.h new file mode 100644 index 000000000000..66ebaae449c8 --- /dev/null +++ b/arch/riscv/include/asm/cpufeature.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright 2022-2023 Rivos, Inc + */ + +#ifndef _ASM_CPUFEATURE_H +#define _ASM_CPUFEATURE_H + +/* + * These are probed via a device_initcall(), via either the SBI or directly + * from the corresponding CSRs. + */ +struct riscv_cpuinfo { + unsigned long mvendorid; + unsigned long marchid; + unsigned long mimpid; +}; + +DECLARE_PER_CPU(struct riscv_cpuinfo, riscv_cpuinfo); + +#endif diff --git a/arch/riscv/kernel/cpu.c b/arch/riscv/kernel/cpu.c index 8400f0cc9704..dafd0caa4f1d 100644 --- a/arch/riscv/kernel/cpu.c +++ b/arch/riscv/kernel/cpu.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -70,12 +71,7 @@ int riscv_of_parent_hartid(struct device_node *node, unsigned long *hartid) return -1; } -struct riscv_cpuinfo { - unsigned long mvendorid; - unsigned long marchid; - unsigned long mimpid; -}; -static DEFINE_PER_CPU(struct riscv_cpuinfo, riscv_cpuinfo); +DEFINE_PER_CPU(struct riscv_cpuinfo, riscv_cpuinfo); unsigned long riscv_cached_mvendorid(unsigned int cpu_id) { -- cgit v1.2.3 From ea3de9ce8aa280c5175c835bd3e94a3a9b814b74 Mon Sep 17 00:00:00 2001 From: Evan Green Date: Fri, 7 Apr 2023 16:10:59 -0700 Subject: RISC-V: Add a syscall for HW probing We don't have enough space for all of these in ELF_HWCAP{,2} and there's no system call that quite does this, so let's just provide an arch-specific one to probe for hardware capabilities. This currently just provides m{arch,imp,vendor}id, but with the key-value pairs we can pass more in the future.
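As a rough userspace sketch of the probing interface this patch introduces (not part of the series itself; it only assumes the uapi header and syscall number added below):

#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <asm/unistd.h>   /* __NR_riscv_hwprobe, added by this patch */
#include <asm/hwprobe.h>  /* struct riscv_hwprobe and key macros, added by this patch */

int main(void)
{
	struct riscv_hwprobe pair = { .key = RISCV_HWPROBE_KEY_MVENDORID };

	/* NULL cpus and 0 cpu_count are the documented shortcut for all online CPUs. */
	if (syscall(__NR_riscv_hwprobe, &pair, 1, 0, NULL, 0) != 0)
		return 1;

	printf("mvendorid: 0x%llx\n", (unsigned long long)pair.value);
	return 0;
}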
Co-developed-by: Palmer Dabbelt Signed-off-by: Evan Green Reviewed-by: Conor Dooley Reviewed-by: Heiko Stuebner Tested-by: Heiko Stuebner Reviewed-by: Paul Walmsley Link: https://lore.kernel.org/r/20230407231103.2622178-3-evan@rivosinc.com Signed-off-by: Palmer Dabbelt --- Documentation/riscv/hwprobe.rst | 41 +++++++++++ Documentation/riscv/index.rst | 1 + arch/riscv/include/asm/hwprobe.h | 13 ++++ arch/riscv/include/asm/syscall.h | 4 + arch/riscv/include/uapi/asm/hwprobe.h | 25 +++++++ arch/riscv/include/uapi/asm/unistd.h | 9 +++ arch/riscv/kernel/sys_riscv.c | 135 +++++++++++++++++++++++++++++++++- 7 files changed, 227 insertions(+), 1 deletion(-) create mode 100644 Documentation/riscv/hwprobe.rst create mode 100644 arch/riscv/include/asm/hwprobe.h create mode 100644 arch/riscv/include/uapi/asm/hwprobe.h diff --git a/Documentation/riscv/hwprobe.rst b/Documentation/riscv/hwprobe.rst new file mode 100644 index 000000000000..211828f706e3 --- /dev/null +++ b/Documentation/riscv/hwprobe.rst @@ -0,0 +1,41 @@ +.. SPDX-License-Identifier: GPL-2.0 + +RISC-V Hardware Probing Interface +--------------------------------- + +The RISC-V hardware probing interface is based around a single syscall, which +is defined in :: + + struct riscv_hwprobe { + __s64 key; + __u64 value; + }; + + long sys_riscv_hwprobe(struct riscv_hwprobe *pairs, size_t pair_count, + size_t cpu_count, cpu_set_t *cpus, + unsigned int flags); + +The arguments are split into three groups: an array of key-value pairs, a CPU +set, and some flags. The key-value pairs are supplied with a count. Userspace +must prepopulate the key field for each element, and the kernel will fill in the +value if the key is recognized. If a key is unknown to the kernel, its key field +will be cleared to -1, and its value set to 0. The CPU set is defined by +CPU_SET(3). For value-like keys (e.g. vendor/arch/impl), the returned value will +only be valid if all CPUs in the given set have the same value. Otherwise -1 +will be returned. For boolean-like keys, the value returned will be a logical +AND of the values for the specified CPUs. Usermode can supply NULL for cpus and +0 for cpu_count as a shortcut for all online CPUs. There are currently no flags; +this value must be zero for future compatibility. + +On success 0 is returned; on failure a negative error code is returned. + +The following keys are defined: + +* :c:macro:`RISCV_HWPROBE_KEY_MVENDORID`: Contains the value of ``mvendorid``, + as defined by the RISC-V privileged architecture specification. + +* :c:macro:`RISCV_HWPROBE_KEY_MARCHID`: Contains the value of ``marchid``, as + defined by the RISC-V privileged architecture specification. + +* :c:macro:`RISCV_HWPROBE_KEY_MIMPID`: Contains the value of ``mimpid``, as + defined by the RISC-V privileged architecture specification.
diff --git a/Documentation/riscv/index.rst b/Documentation/riscv/index.rst index 2e5b18fbb145..175a91db0200 100644 --- a/Documentation/riscv/index.rst +++ b/Documentation/riscv/index.rst @@ -7,6 +7,7 @@ RISC-V architecture boot-image-header vm-layout + hwprobe patch-acceptance uabi diff --git a/arch/riscv/include/asm/hwprobe.h b/arch/riscv/include/asm/hwprobe.h new file mode 100644 index 000000000000..6184bbc77256 --- /dev/null +++ b/arch/riscv/include/asm/hwprobe.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Copyright 2023 Rivos, Inc + */ + +#ifndef _ASM_HWPROBE_H +#define _ASM_HWPROBE_H + +#include + +#define RISCV_HWPROBE_MAX_KEY 2 + +#endif diff --git a/arch/riscv/include/asm/syscall.h b/arch/riscv/include/asm/syscall.h index 384a63b86420..3b5a667928bf 100644 --- a/arch/riscv/include/asm/syscall.h +++ b/arch/riscv/include/asm/syscall.h @@ -10,6 +10,7 @@ #ifndef _ASM_RISCV_SYSCALL_H #define _ASM_RISCV_SYSCALL_H +#include #include #include #include @@ -75,4 +76,7 @@ static inline int syscall_get_arch(struct task_struct *task) } asmlinkage long sys_riscv_flush_icache(uintptr_t, uintptr_t, uintptr_t); + +asmlinkage long sys_riscv_hwprobe(struct riscv_hwprobe *, size_t, size_t, + unsigned long *, unsigned int); #endif /* _ASM_RISCV_SYSCALL_H */ diff --git a/arch/riscv/include/uapi/asm/hwprobe.h b/arch/riscv/include/uapi/asm/hwprobe.h new file mode 100644 index 000000000000..b79be00920db --- /dev/null +++ b/arch/riscv/include/uapi/asm/hwprobe.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Copyright 2023 Rivos, Inc + */ + +#ifndef _UAPI_ASM_HWPROBE_H +#define _UAPI_ASM_HWPROBE_H + +#include + +/* + * Interface for probing hardware capabilities from userspace, see + * Documentation/riscv/hwprobe.rst for more information. + */ +struct riscv_hwprobe { + __s64 key; + __u64 value; +}; + +#define RISCV_HWPROBE_KEY_MVENDORID 0 +#define RISCV_HWPROBE_KEY_MARCHID 1 +#define RISCV_HWPROBE_KEY_MIMPID 2 +/* Increase RISCV_HWPROBE_MAX_KEY when adding items. */ + +#endif diff --git a/arch/riscv/include/uapi/asm/unistd.h b/arch/riscv/include/uapi/asm/unistd.h index 73d7cdd2ec49..950ab3fd4409 100644 --- a/arch/riscv/include/uapi/asm/unistd.h +++ b/arch/riscv/include/uapi/asm/unistd.h @@ -43,3 +43,12 @@ #define __NR_riscv_flush_icache (__NR_arch_specific_syscall + 15) #endif __SYSCALL(__NR_riscv_flush_icache, sys_riscv_flush_icache) + +/* + * Allows userspace to query the kernel for CPU architecture and + * microarchitecture details across a given set of CPUs. + */ +#ifndef __NR_riscv_hwprobe +#define __NR_riscv_hwprobe (__NR_arch_specific_syscall + 14) +#endif +__SYSCALL(__NR_riscv_hwprobe, sys_riscv_hwprobe) diff --git a/arch/riscv/kernel/sys_riscv.c b/arch/riscv/kernel/sys_riscv.c index 5d3f2fbeb33c..fe8e833ecb2e 100644 --- a/arch/riscv/kernel/sys_riscv.c +++ b/arch/riscv/kernel/sys_riscv.c @@ -6,8 +6,11 @@ */ #include -#include #include +#include +#include +#include +#include #include static long riscv_sys_mmap(unsigned long addr, unsigned long len, @@ -69,3 +72,133 @@ SYSCALL_DEFINE3(riscv_flush_icache, uintptr_t, start, uintptr_t, end, return 0; } + +/* + * The hwprobe interface, for allowing userspace to probe to see which features + * are supported by the hardware. See Documentation/riscv/hwprobe.rst for more + * details. 
+ */ +static void hwprobe_arch_id(struct riscv_hwprobe *pair, + const struct cpumask *cpus) +{ + u64 id = -1ULL; + bool first = true; + int cpu; + + for_each_cpu(cpu, cpus) { + u64 cpu_id; + + switch (pair->key) { + case RISCV_HWPROBE_KEY_MVENDORID: + cpu_id = riscv_cached_mvendorid(cpu); + break; + case RISCV_HWPROBE_KEY_MIMPID: + cpu_id = riscv_cached_mimpid(cpu); + break; + case RISCV_HWPROBE_KEY_MARCHID: + cpu_id = riscv_cached_marchid(cpu); + break; + } + + if (first) { + id = cpu_id; + first = false; + } + + /* + * If there's a mismatch for the given set, return -1 in the + * value. + */ + if (id != cpu_id) { + id = -1ULL; + break; + } + } + + pair->value = id; +} + +static void hwprobe_one_pair(struct riscv_hwprobe *pair, + const struct cpumask *cpus) +{ + switch (pair->key) { + case RISCV_HWPROBE_KEY_MVENDORID: + case RISCV_HWPROBE_KEY_MARCHID: + case RISCV_HWPROBE_KEY_MIMPID: + hwprobe_arch_id(pair, cpus); + break; + + /* + * For forward compatibility, unknown keys don't fail the whole + * call, but get their element key set to -1 and value set to 0 + * indicating they're unrecognized. + */ + default: + pair->key = -1; + pair->value = 0; + break; + } +} + +static int do_riscv_hwprobe(struct riscv_hwprobe __user *pairs, + size_t pair_count, size_t cpu_count, + unsigned long __user *cpus_user, + unsigned int flags) +{ + size_t out; + int ret; + cpumask_t cpus; + + /* Check the reserved flags. */ + if (flags != 0) + return -EINVAL; + + /* + * The interface supports taking in a CPU mask, and returns values that + * are consistent across that mask. Allow userspace to specify NULL and + * 0 as a shortcut to all online CPUs. + */ + cpumask_clear(&cpus); + if (!cpu_count && !cpus_user) { + cpumask_copy(&cpus, cpu_online_mask); + } else { + if (cpu_count > cpumask_size()) + cpu_count = cpumask_size(); + + ret = copy_from_user(&cpus, cpus_user, cpu_count); + if (ret) + return -EFAULT; + + /* + * Userspace must provide at least one online CPU, without that + * there's no way to define what is supported. + */ + cpumask_and(&cpus, &cpus, cpu_online_mask); + if (cpumask_empty(&cpus)) + return -EINVAL; + } + + for (out = 0; out < pair_count; out++, pairs++) { + struct riscv_hwprobe pair; + + if (get_user(pair.key, &pairs->key)) + return -EFAULT; + + pair.value = 0; + hwprobe_one_pair(&pair, &cpus); + ret = put_user(pair.key, &pairs->key); + if (ret == 0) + ret = put_user(pair.value, &pairs->value); + + if (ret) + return -EFAULT; + } + + return 0; +} + +SYSCALL_DEFINE5(riscv_hwprobe, struct riscv_hwprobe __user *, pairs, + size_t, pair_count, size_t, cpu_count, unsigned long __user *, + cpus, unsigned int, flags) +{ + return do_riscv_hwprobe(pairs, pair_count, cpu_count, + cpus, flags); +} -- cgit v1.2.3 From 00e76e2c6a2bd3976d44d4a1fdd0b7a3c2566607 Mon Sep 17 00:00:00 2001 From: Evan Green Date: Fri, 7 Apr 2023 16:11:00 -0700 Subject: RISC-V: hwprobe: Add support for RISCV_HWPROBE_BASE_BEHAVIOR_IMA We have an implicit set of base behaviors that userspace depends on, which are mostly defined in various ISA specifications.
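To illustrate how a userspace consumer might use the two keys added below, here is a hedged sketch; every macro it references comes from this patch's uapi additions, while the function itself is not part of the series:

#include <stdbool.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <asm/unistd.h>
#include <asm/hwprobe.h>

/* Sketch: true when every online CPU reports the IMA base behavior plus F, D and C. */
static bool have_ima_fd_c(void)
{
	struct riscv_hwprobe pairs[2] = {
		{ .key = RISCV_HWPROBE_KEY_BASE_BEHAVIOR },
		{ .key = RISCV_HWPROBE_KEY_IMA_EXT_0 },
	};

	if (syscall(__NR_riscv_hwprobe, pairs, 2, 0, NULL, 0) != 0)
		return false;

	return (pairs[0].value & RISCV_HWPROBE_BASE_BEHAVIOR_IMA) &&
	       (pairs[1].value & RISCV_HWPROBE_IMA_FD) &&
	       (pairs[1].value & RISCV_HWPROBE_IMA_C);
}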
Co-developed-by: Palmer Dabbelt Signed-off-by: Evan Green Reviewed-by: Conor Dooley Reviewed-by: Heiko Stuebner Tested-by: Heiko Stuebner Reviewed-by: Paul Walmsley Link: https://lore.kernel.org/r/20230407231103.2622178-4-evan@rivosinc.com Signed-off-by: Palmer Dabbelt --- Documentation/riscv/hwprobe.rst | 24 ++++++++++++++++++++++ arch/riscv/include/asm/hwprobe.h | 2 +- arch/riscv/include/uapi/asm/hwprobe.h | 5 +++++ arch/riscv/kernel/sys_riscv.c | 20 ++++++++++++++++++++ 4 files changed, 50 insertions(+), 1 deletion(-) diff --git a/Documentation/riscv/hwprobe.rst b/Documentation/riscv/hwprobe.rst index 211828f706e3..945d44683c40 100644 --- a/Documentation/riscv/hwprobe.rst +++ b/Documentation/riscv/hwprobe.rst @@ -39,3 +39,27 @@ The following keys are defined: * :c:macro:`RISCV_HWPROBE_KEY_MIMPID`: Contains the value of ``mimpid``, as defined by the RISC-V privileged architecture specification. + +* :c:macro:`RISCV_HWPROBE_KEY_BASE_BEHAVIOR`: A bitmask containing the base + user-visible behavior that this kernel supports. The following base user ABIs + are defined: + + * :c:macro:`RISCV_HWPROBE_BASE_BEHAVIOR_IMA`: Support for rv32ima or + rv64ima, as defined by version 2.2 of the user ISA and version 1.10 of the + privileged ISA, with the following known exceptions (more exceptions may be + added, but only if it can be demonstrated that the user ABI is not broken): + + * The ``fence.i`` instruction cannot be directly executed by userspace + programs (it may still be executed in userspace via a + kernel-controlled mechanism such as the vDSO). + +* :c:macro:`RISCV_HWPROBE_KEY_IMA_EXT_0`: A bitmask containing the extensions + that are compatible with the :c:macro:`RISCV_HWPROBE_BASE_BEHAVIOR_IMA` + base system behavior. + + * :c:macro:`RISCV_HWPROBE_IMA_FD`: The F and D extensions are supported, as + defined by commit cd20cee ("FMIN/FMAX now implement + minimumNumber/maximumNumber, not minNum/maxNum") of the RISC-V ISA manual. + + * :c:macro:`RISCV_HWPROBE_IMA_C`: The C extension is supported, as defined + by version 2.2 of the RISC-V ISA manual. diff --git a/arch/riscv/include/asm/hwprobe.h b/arch/riscv/include/asm/hwprobe.h index 6184bbc77256..d717c80a64ff 100644 --- a/arch/riscv/include/asm/hwprobe.h +++ b/arch/riscv/include/asm/hwprobe.h @@ -8,6 +8,6 @@ #include -#define RISCV_HWPROBE_MAX_KEY 2 +#define RISCV_HWPROBE_MAX_KEY 4 #endif diff --git a/arch/riscv/include/uapi/asm/hwprobe.h b/arch/riscv/include/uapi/asm/hwprobe.h index b79be00920db..398e08f7e083 100644 --- a/arch/riscv/include/uapi/asm/hwprobe.h +++ b/arch/riscv/include/uapi/asm/hwprobe.h @@ -20,6 +20,11 @@ struct riscv_hwprobe { #define RISCV_HWPROBE_KEY_MVENDORID 0 #define RISCV_HWPROBE_KEY_MARCHID 1 #define RISCV_HWPROBE_KEY_MIMPID 2 +#define RISCV_HWPROBE_KEY_BASE_BEHAVIOR 3 +#define RISCV_HWPROBE_BASE_BEHAVIOR_IMA (1 << 0) +#define RISCV_HWPROBE_KEY_IMA_EXT_0 4 +#define RISCV_HWPROBE_IMA_FD (1 << 0) +#define RISCV_HWPROBE_IMA_C (1 << 1) /* Increase RISCV_HWPROBE_MAX_KEY when adding items.
*/ #endif diff --git a/arch/riscv/kernel/sys_riscv.c b/arch/riscv/kernel/sys_riscv.c index fe8e833ecb2e..5ca567cef142 100644 --- a/arch/riscv/kernel/sys_riscv.c +++ b/arch/riscv/kernel/sys_riscv.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -125,6 +126,25 @@ static void hwprobe_one_pair(struct riscv_hwprobe *pair, case RISCV_HWPROBE_KEY_MIMPID: hwprobe_arch_id(pair, cpus); break; + /* + * The kernel already assumes that the base single-letter ISA + * extensions are supported on all harts, and only supports the + * IMA base, so just cheat a bit here and tell that to + * userspace. + */ + case RISCV_HWPROBE_KEY_BASE_BEHAVIOR: + pair->value = RISCV_HWPROBE_BASE_BEHAVIOR_IMA; + break; + + case RISCV_HWPROBE_KEY_IMA_EXT_0: + pair->value = 0; + if (has_fpu()) + pair->value |= RISCV_HWPROBE_IMA_FD; + + if (riscv_isa_extension_available(NULL, c)) + pair->value |= RISCV_HWPROBE_IMA_C; + + break; /* * For forward compatibility, unknown keys don't fail the whole -- cgit v1.2.3 From 62a31d6e38bd0faef7c956b358d651f7bdc4ae0c Mon Sep 17 00:00:00 2001 From: Evan Green Date: Fri, 7 Apr 2023 16:11:01 -0700 Subject: RISC-V: hwprobe: Support probing of misaligned access performance This allows userspace to select various routines to use based on the performance of misaligned access on the target hardware. Rather than adding DT bindings, this change taps into the alternatives mechanism used to probe CPU errata. Add a new function pointer alongside the vendor-specific errata_patch_func() that probes for desirable errata (otherwise known as "features"). Unlike the errata_patch_func(), this function is called on each CPU as it comes up, so it can save feature information per-CPU. The T-head C906 has fast unaligned access, both as defined by GCC [1], and in performing a basic benchmark, which determined that byte copies are >50% slower than a misaligned word copy of the same data size (source for this test at [2]): bytecopy size f000 count 50000 offset 0 took 31664899 us wordcopy size f000 count 50000 offset 0 took 5180919 us wordcopy size f000 count 50000 offset 1 took 13416949 us [1] https://github.com/gcc-mirror/gcc/blob/master/gcc/config/riscv/riscv.cc#L353 [2] https://pastebin.com/EPXvDHSW Co-developed-by: Palmer Dabbelt Signed-off-by: Evan Green Reviewed-by: Heiko Stuebner Tested-by: Heiko Stuebner Reviewed-by: Conor Dooley Reviewed-by: Paul Walmsley Link: https://lore.kernel.org/r/20230407231103.2622178-5-evan@rivosinc.com Signed-off-by: Palmer Dabbelt --- Documentation/riscv/hwprobe.rst | 21 +++++++++++++++++++++ arch/riscv/errata/thead/errata.c | 10 ++++++++++ arch/riscv/include/asm/alternative.h | 5 +++++ arch/riscv/include/asm/cpufeature.h | 2 ++ arch/riscv/include/asm/hwprobe.h | 2 +- arch/riscv/include/uapi/asm/hwprobe.h | 7 +++++++ arch/riscv/kernel/alternative.c | 19 +++++++++++++++++++ arch/riscv/kernel/cpufeature.c | 3 +++ arch/riscv/kernel/smpboot.c | 1 + arch/riscv/kernel/sys_riscv.c | 28 ++++++++++++++++++++++++++++ 10 files changed, 97 insertions(+), 1 deletion(-) diff --git a/Documentation/riscv/hwprobe.rst b/Documentation/riscv/hwprobe.rst index 945d44683c40..9f0dd62dcb5d 100644 --- a/Documentation/riscv/hwprobe.rst +++ b/Documentation/riscv/hwprobe.rst @@ -63,3 +63,24 @@ The following keys are defined: * :c:macro:`RISCV_HWPROBE_IMA_C`: The C extension is supported, as defined by version 2.2 of the RISC-V ISA manual. + +* :c:macro:`RISCV_HWPROBE_KEY_CPUPERF_0`: A bitmask that contains performance + information about the selected set of processors. 
+ + * :c:macro:`RISCV_HWPROBE_MISALIGNED_UNKNOWN`: The performance of misaligned + accesses is unknown. + + * :c:macro:`RISCV_HWPROBE_MISALIGNED_EMULATED`: Misaligned accesses are + emulated via software, either in or below the kernel. These accesses are + always extremely slow. + + * :c:macro:`RISCV_HWPROBE_MISALIGNED_SLOW`: Misaligned accesses are supported + in hardware, but are slower than the corresponding aligned access + sequences. + + * :c:macro:`RISCV_HWPROBE_MISALIGNED_FAST`: Misaligned accesses are supported + in hardware and are faster than the corresponding aligned access + sequences. + + * :c:macro:`RISCV_HWPROBE_MISALIGNED_UNSUPPORTED`: Misaligned accesses are + not supported at all and will generate a misaligned address fault. diff --git a/arch/riscv/errata/thead/errata.c b/arch/riscv/errata/thead/errata.c index 3b96a06d3c54..5b6d62586a8b 100644 --- a/arch/riscv/errata/thead/errata.c +++ b/arch/riscv/errata/thead/errata.c @@ -11,7 +11,9 @@ #include #include #include +#include #include +#include #include #include @@ -115,3 +117,11 @@ void __init_or_module thead_errata_patch_func(struct alt_entry *begin, struct al if (stage == RISCV_ALTERNATIVES_EARLY_BOOT) local_flush_icache_all(); } + +void __init_or_module thead_feature_probe_func(unsigned int cpu, + unsigned long archid, + unsigned long impid) +{ + if ((archid == 0) && (impid == 0)) + per_cpu(misaligned_access_speed, cpu) = RISCV_HWPROBE_MISALIGNED_FAST; +} diff --git a/arch/riscv/include/asm/alternative.h b/arch/riscv/include/asm/alternative.h index b8648d4f2ac1..b5774e24d4a3 100644 --- a/arch/riscv/include/asm/alternative.h +++ b/arch/riscv/include/asm/alternative.h @@ -28,6 +28,7 @@ #define ALT_OLD_PTR(a) __ALT_PTR(a, old_offset) #define ALT_ALT_PTR(a) __ALT_PTR(a, alt_offset) +void __init probe_vendor_features(unsigned int cpu); void __init apply_boot_alternatives(void); void __init apply_early_boot_alternatives(void); void apply_module_alternatives(void *start, size_t length); @@ -55,11 +56,15 @@ void thead_errata_patch_func(struct alt_entry *begin, struct alt_entry *end, unsigned long archid, unsigned long impid, unsigned int stage); +void thead_feature_probe_func(unsigned int cpu, unsigned long archid, + unsigned long impid); + void riscv_cpufeature_patch_func(struct alt_entry *begin, struct alt_entry *end, unsigned int stage); #else /* CONFIG_RISCV_ALTERNATIVE */ +static inline void probe_vendor_features(unsigned int cpu) { } static inline void apply_boot_alternatives(void) { } static inline void apply_early_boot_alternatives(void) { } static inline void apply_module_alternatives(void *start, size_t length) { } diff --git a/arch/riscv/include/asm/cpufeature.h b/arch/riscv/include/asm/cpufeature.h index 66ebaae449c8..808d5403f2ac 100644 --- a/arch/riscv/include/asm/cpufeature.h +++ b/arch/riscv/include/asm/cpufeature.h @@ -18,4 +18,6 @@ struct riscv_cpuinfo { DECLARE_PER_CPU(struct riscv_cpuinfo, riscv_cpuinfo); +DECLARE_PER_CPU(long, misaligned_access_speed); + #endif diff --git a/arch/riscv/include/asm/hwprobe.h b/arch/riscv/include/asm/hwprobe.h index d717c80a64ff..78936f4ff513 100644 --- a/arch/riscv/include/asm/hwprobe.h +++ b/arch/riscv/include/asm/hwprobe.h @@ -8,6 +8,6 @@ #include -#define RISCV_HWPROBE_MAX_KEY 4 +#define RISCV_HWPROBE_MAX_KEY 5 #endif diff --git a/arch/riscv/include/uapi/asm/hwprobe.h b/arch/riscv/include/uapi/asm/hwprobe.h index 398e08f7e083..8d745a4ad8a2 100644 --- a/arch/riscv/include/uapi/asm/hwprobe.h +++ b/arch/riscv/include/uapi/asm/hwprobe.h @@ -25,6 +25,13 @@ struct
riscv_hwprobe { #define RISCV_HWPROBE_KEY_IMA_EXT_0 4 #define RISCV_HWPROBE_IMA_FD (1 << 0) #define RISCV_HWPROBE_IMA_C (1 << 1) +#define RISCV_HWPROBE_KEY_CPUPERF_0 5 +#define RISCV_HWPROBE_MISALIGNED_UNKNOWN (0 << 0) +#define RISCV_HWPROBE_MISALIGNED_EMULATED (1 << 0) +#define RISCV_HWPROBE_MISALIGNED_SLOW (2 << 0) +#define RISCV_HWPROBE_MISALIGNED_FAST (3 << 0) +#define RISCV_HWPROBE_MISALIGNED_UNSUPPORTED (4 << 0) +#define RISCV_HWPROBE_MISALIGNED_MASK (7 << 0) /* Increase RISCV_HWPROBE_MAX_KEY when adding items. */ #endif diff --git a/arch/riscv/kernel/alternative.c b/arch/riscv/kernel/alternative.c index 2354c69dc7d1..fc65c9293ac5 100644 --- a/arch/riscv/kernel/alternative.c +++ b/arch/riscv/kernel/alternative.c @@ -27,6 +27,8 @@ struct cpu_manufacturer_info_t { void (*patch_func)(struct alt_entry *begin, struct alt_entry *end, unsigned long archid, unsigned long impid, unsigned int stage); + void (*feature_probe_func)(unsigned int cpu, unsigned long archid, + unsigned long impid); }; static void __init_or_module riscv_fill_cpu_mfr_info(struct cpu_manufacturer_info_t *cpu_mfr_info) @@ -41,6 +43,7 @@ static void __init_or_module riscv_fill_cpu_mfr_info(struct cpu_manufacturer_inf cpu_mfr_info->imp_id = sbi_get_mimpid(); #endif + cpu_mfr_info->feature_probe_func = NULL; switch (cpu_mfr_info->vendor_id) { #ifdef CONFIG_ERRATA_SIFIVE case SIFIVE_VENDOR_ID: @@ -50,6 +53,7 @@ static void __init_or_module riscv_fill_cpu_mfr_info(struct cpu_manufacturer_inf #ifdef CONFIG_ERRATA_THEAD case THEAD_VENDOR_ID: cpu_mfr_info->patch_func = thead_errata_patch_func; + cpu_mfr_info->feature_probe_func = thead_feature_probe_func; break; #endif default: @@ -139,6 +143,20 @@ void riscv_alternative_fix_offsets(void *alt_ptr, unsigned int len, } } +/* Called on each CPU as it starts */ +void __init_or_module probe_vendor_features(unsigned int cpu) +{ + struct cpu_manufacturer_info_t cpu_mfr_info; + + riscv_fill_cpu_mfr_info(&cpu_mfr_info); + if (!cpu_mfr_info.feature_probe_func) + return; + + cpu_mfr_info.feature_probe_func(cpu, + cpu_mfr_info.arch_id, + cpu_mfr_info.imp_id); +} + /* * This is called very early in the boot process (directly after we run * a feature detect on the boot CPU). 
No need to worry about other CPUs @@ -193,6 +211,7 @@ void __init apply_boot_alternatives(void) /* If called on non-boot cpu things could go wrong */ WARN_ON(smp_processor_id() != 0); + probe_vendor_features(0); _apply_alternatives((struct alt_entry *)__alt_start, (struct alt_entry *)__alt_end, RISCV_ALTERNATIVES_BOOT); diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c index 59d58ee0f68d..8bbc89351050 100644 --- a/arch/riscv/kernel/cpufeature.c +++ b/arch/riscv/kernel/cpufeature.c @@ -30,6 +30,9 @@ unsigned long elf_hwcap __read_mostly; /* Host ISA bitmap */ static DECLARE_BITMAP(riscv_isa, RISCV_ISA_EXT_MAX) __read_mostly; +/* Performance information */ +DEFINE_PER_CPU(long, misaligned_access_speed); + /** * riscv_isa_extension_base() - Get base extension word * diff --git a/arch/riscv/kernel/smpboot.c b/arch/riscv/kernel/smpboot.c index ddb2afba6d25..2867c12c3d16 100644 --- a/arch/riscv/kernel/smpboot.c +++ b/arch/riscv/kernel/smpboot.c @@ -168,6 +168,7 @@ asmlinkage __visible void smp_callin(void) notify_cpu_starting(curr_cpuid); numa_add_cpu(curr_cpuid); set_cpu_online(curr_cpuid, 1); + probe_vendor_features(curr_cpuid); /* * Remote TLB flushes are ignored while the CPU is offline, so emit diff --git a/arch/riscv/kernel/sys_riscv.c b/arch/riscv/kernel/sys_riscv.c index 5ca567cef142..55389e7595f6 100644 --- a/arch/riscv/kernel/sys_riscv.c +++ b/arch/riscv/kernel/sys_riscv.c @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -117,6 +118,29 @@ static void hwprobe_arch_id(struct riscv_hwprobe *pair, pair->value = id; } +static u64 hwprobe_misaligned(const struct cpumask *cpus) +{ + int cpu; + u64 perf = -1ULL; + + for_each_cpu(cpu, cpus) { + int this_perf = per_cpu(misaligned_access_speed, cpu); + + if (perf == -1ULL) + perf = this_perf; + + if (perf != this_perf) { + perf = RISCV_HWPROBE_MISALIGNED_UNKNOWN; + break; + } + } + + if (perf == -1ULL) + return RISCV_HWPROBE_MISALIGNED_UNKNOWN; + + return perf; +} + static void hwprobe_one_pair(struct riscv_hwprobe *pair, const struct cpumask *cpus) { @@ -146,6 +170,10 @@ static void hwprobe_one_pair(struct riscv_hwprobe *pair, break; + case RISCV_HWPROBE_KEY_CPUPERF_0: + pair->value = hwprobe_misaligned(cpus); + break; + /* * For forward compatibility, unknown keys don't fail the whole * call, but get their element key set to -1 and value set to 0 -- cgit v1.2.3 From 287dcc2b0c831d3e3887421f68e3db6b8b8f4eb1 Mon Sep 17 00:00:00 2001 From: Evan Green Date: Fri, 7 Apr 2023 16:11:02 -0700 Subject: selftests: Test the new RISC-V hwprobe interface This adds a test for the recently added RISC-V interface for probing hardware capabilities. It happens to be the first selftest we have for RISC-V, so I've added some infrastructure for those as well. 
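Selftests aside, the misaligned-access key from the previous patch also lends itself to runtime dispatch in applications. The fragment below is only a sketch of that pattern; copy_bytewise() and copy_unaligned_words() are hypothetical helpers, and only the uapi macros and syscall come from this series:

#include <stddef.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <asm/unistd.h>
#include <asm/hwprobe.h>

/* Hypothetical copy routines; not part of this series. */
void *copy_bytewise(void *dst, const void *src, size_t n);
void *copy_unaligned_words(void *dst, const void *src, size_t n);

void *(*pick_memcpy(void))(void *, const void *, size_t)
{
	struct riscv_hwprobe pair = { .key = RISCV_HWPROBE_KEY_CPUPERF_0 };

	if (syscall(__NR_riscv_hwprobe, &pair, 1, 0, NULL, 0) == 0 &&
	    (pair.value & RISCV_HWPROBE_MISALIGNED_MASK) == RISCV_HWPROBE_MISALIGNED_FAST)
		return copy_unaligned_words;

	/* Fall back to the conservative routine when misaligned access is slow or unknown. */
	return copy_bytewise;
}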
Co-developed-by: Palmer Dabbelt Signed-off-by: Evan Green Link: https://lore.kernel.org/r/20230407231103.2622178-6-evan@rivosinc.com Signed-off-by: Palmer Dabbelt --- tools/testing/selftests/Makefile | 1 + tools/testing/selftests/riscv/Makefile | 58 ++++++++++++++ tools/testing/selftests/riscv/hwprobe/Makefile | 10 +++ tools/testing/selftests/riscv/hwprobe/hwprobe.c | 90 ++++++++++++++++++++++ .../testing/selftests/riscv/hwprobe/sys_hwprobe.S | 12 +++ 5 files changed, 171 insertions(+) create mode 100644 tools/testing/selftests/riscv/Makefile create mode 100644 tools/testing/selftests/riscv/hwprobe/Makefile create mode 100644 tools/testing/selftests/riscv/hwprobe/hwprobe.c create mode 100644 tools/testing/selftests/riscv/hwprobe/sys_hwprobe.S diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 13a6837a0c6b..4bea26109450 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -63,6 +63,7 @@ TARGETS += pstore TARGETS += ptrace TARGETS += openat2 TARGETS += resctrl +TARGETS += riscv TARGETS += rlimits TARGETS += rseq TARGETS += rtc diff --git a/tools/testing/selftests/riscv/Makefile b/tools/testing/selftests/riscv/Makefile new file mode 100644 index 000000000000..32a72902d045 --- /dev/null +++ b/tools/testing/selftests/riscv/Makefile @@ -0,0 +1,58 @@ +# SPDX-License-Identifier: GPL-2.0 +# Originally tools/testing/arm64/Makefile + +# When ARCH not overridden for crosscompiling, lookup machine +ARCH ?= $(shell uname -m 2>/dev/null || echo not) + +ifneq (,$(filter $(ARCH),riscv)) +RISCV_SUBTARGETS ?= hwprobe +else +RISCV_SUBTARGETS := +endif + +CFLAGS := -Wall -O2 -g + +# A proper top_srcdir is needed by KSFT(lib.mk) +top_srcdir = $(realpath ../../../../) + +# Additional include paths needed by kselftest.h and local headers +CFLAGS += -I$(top_srcdir)/tools/testing/selftests/ + +CFLAGS += $(KHDR_INCLUDES) + +export CFLAGS +export top_srcdir + +all: + @for DIR in $(RISCV_SUBTARGETS); do \ + BUILD_TARGET=$(OUTPUT)/$$DIR; \ + mkdir -p $$BUILD_TARGET; \ + $(MAKE) OUTPUT=$$BUILD_TARGET -C $$DIR $@; \ + done + +install: all + @for DIR in $(RISCV_SUBTARGETS); do \ + BUILD_TARGET=$(OUTPUT)/$$DIR; \ + $(MAKE) OUTPUT=$$BUILD_TARGET -C $$DIR $@; \ + done + +run_tests: all + @for DIR in $(RISCV_SUBTARGETS); do \ + BUILD_TARGET=$(OUTPUT)/$$DIR; \ + $(MAKE) OUTPUT=$$BUILD_TARGET -C $$DIR $@; \ + done + +# Avoid any output on non riscv on emit_tests +emit_tests: all + @for DIR in $(RISCV_SUBTARGETS); do \ + BUILD_TARGET=$(OUTPUT)/$$DIR; \ + $(MAKE) OUTPUT=$$BUILD_TARGET -C $$DIR $@; \ + done + +clean: + @for DIR in $(RISCV_SUBTARGETS); do \ + BUILD_TARGET=$(OUTPUT)/$$DIR; \ + $(MAKE) OUTPUT=$$BUILD_TARGET -C $$DIR $@; \ + done + +.PHONY: all clean install run_tests emit_tests diff --git a/tools/testing/selftests/riscv/hwprobe/Makefile b/tools/testing/selftests/riscv/hwprobe/Makefile new file mode 100644 index 000000000000..ebdbb3c22e54 --- /dev/null +++ b/tools/testing/selftests/riscv/hwprobe/Makefile @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: GPL-2.0 +# Copyright (C) 2021 ARM Limited +# Originally tools/testing/arm64/abi/Makefile + +TEST_GEN_PROGS := hwprobe + +include ../../lib.mk + +$(OUTPUT)/hwprobe: hwprobe.c sys_hwprobe.S + $(CC) -o$@ $(CFLAGS) $(LDFLAGS) $^ diff --git a/tools/testing/selftests/riscv/hwprobe/hwprobe.c b/tools/testing/selftests/riscv/hwprobe/hwprobe.c new file mode 100644 index 000000000000..09f290a67420 --- /dev/null +++ b/tools/testing/selftests/riscv/hwprobe/hwprobe.c @@ -0,0 +1,90 @@ +// SPDX-License-Identifier: 
GPL-2.0-only +#include +#include + +/* + * Rather than relying on having a new enough libc to define this, just do it + * ourselves. This way we don't need to be coupled to a new-enough libc to + * contain the call. + */ +long riscv_hwprobe(struct riscv_hwprobe *pairs, size_t pair_count, + size_t cpu_count, unsigned long *cpus, unsigned int flags); + +int main(int argc, char **argv) +{ + struct riscv_hwprobe pairs[8]; + unsigned long cpus; + long out; + + /* Fake the CPU_SET ops. */ + cpus = -1; + + /* + * Just run a basic test: pass enough pairs to get up to the base + * behavior, and then check to make sure it's sane. + */ + for (long i = 0; i < 8; i++) + pairs[i].key = i; + out = riscv_hwprobe(pairs, 8, 1, &cpus, 0); + if (out != 0) + return -1; + for (long i = 0; i < 8; ++i) { + /* Fail if the kernel claims not to recognize a base key. */ + if ((i < 4) && (pairs[i].key != i)) + return -2; + + if (pairs[i].key != RISCV_HWPROBE_KEY_BASE_BEHAVIOR) + continue; + + if (pairs[i].value & RISCV_HWPROBE_BASE_BEHAVIOR_IMA) + continue; + + return -3; + } + + /* + * This should also work with a NULL CPU set, but should not work + * with an improperly supplied CPU set. + */ + out = riscv_hwprobe(pairs, 8, 0, 0, 0); + if (out != 0) + return -4; + + out = riscv_hwprobe(pairs, 8, 0, &cpus, 0); + if (out == 0) + return -5; + + out = riscv_hwprobe(pairs, 8, 1, 0, 0); + if (out == 0) + return -6; + + /* + * Check that keys work by providing one that we know exists, and + * checking to make sure the resulting pair is what we asked for. + */ + pairs[0].key = RISCV_HWPROBE_KEY_BASE_BEHAVIOR; + out = riscv_hwprobe(pairs, 1, 1, &cpus, 0); + if (out != 0) + return -7; + if (pairs[0].key != RISCV_HWPROBE_KEY_BASE_BEHAVIOR) + return -8; + + /* + * Check that an unknown key gets overwritten with -1, + * but doesn't block elements after it. + */ + pairs[0].key = 0x5555; + pairs[1].key = 1; + pairs[1].value = 0xAAAA; + out = riscv_hwprobe(pairs, 2, 0, 0, 0); + if (out != 0) + return -9; + + if (pairs[0].key != -1) + return -10; + + if ((pairs[1].key != 1) || (pairs[1].value == 0xAAAA)) + return -11; + + return 0; +} diff --git a/tools/testing/selftests/riscv/hwprobe/sys_hwprobe.S b/tools/testing/selftests/riscv/hwprobe/sys_hwprobe.S new file mode 100644 index 000000000000..a4773c88d267 --- /dev/null +++ b/tools/testing/selftests/riscv/hwprobe/sys_hwprobe.S @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2023 Rivos, Inc */ + +.text +.global riscv_hwprobe +riscv_hwprobe: + # Put __NR_riscv_hwprobe in the syscall number register, then just shim + # back the kernel's return. This doesn't do any sort of errno + # handling; the caller can deal with it. + li a7, 258 + ecall + ret -- cgit v1.2.3 From aa5af0aa90bad3f1cad5a90ee5eecd92ac9f3096 Mon Sep 17 00:00:00 2001 From: Evan Green Date: Fri, 7 Apr 2023 16:11:03 -0700 Subject: RISC-V: Add hwprobe vDSO function and data Add a vDSO function __vdso_riscv_hwprobe, which can sit in front of the riscv_hwprobe syscall and answer common queries. We stash a copy of static answers for the "all CPUs" case in the vDSO data page. This data is private to the vDSO, so we can decide later to change what's stored there or under what conditions we defer to the syscall. Currently all data can be discovered at boot, so the vDSO function answers all queries when the cpumask is set to the "all CPUs" hint. There's also a boolean in the data that lets the vDSO function know that all CPUs are the same.
In that case, the vDSO will also answer queries for arbitrary CPU masks in addition to the "all CPUs" hint. Signed-off-by: Evan Green Link: https://lore.kernel.org/r/20230407231103.2622178-7-evan@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 1 + arch/riscv/include/asm/vdso/data.h | 17 ++++++++++ arch/riscv/include/asm/vdso/gettimeofday.h | 8 +++++ arch/riscv/kernel/compat_vdso/Makefile | 2 +- arch/riscv/kernel/sys_riscv.c | 45 ++++++++++++++++++++++++++ arch/riscv/kernel/vdso.c | 6 ---- arch/riscv/kernel/vdso/Makefile | 4 +++ arch/riscv/kernel/vdso/hwprobe.c | 52 ++++++++++++++++++++++++++++++ arch/riscv/kernel/vdso/sys_hwprobe.S | 15 +++++++++ arch/riscv/kernel/vdso/vdso.lds.S | 3 ++ 10 files changed, 146 insertions(+), 7 deletions(-) create mode 100644 arch/riscv/include/asm/vdso/data.h create mode 100644 arch/riscv/kernel/vdso/hwprobe.c create mode 100644 arch/riscv/kernel/vdso/sys_hwprobe.S diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index c5e42cc37604..a20e105efa4e 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -33,6 +33,7 @@ config RISCV select ARCH_HAS_STRICT_MODULE_RWX if MMU && !XIP_KERNEL select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_HAS_UBSAN_SANITIZE_ALL + select ARCH_HAS_VDSO_DATA select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX select ARCH_OPTIONAL_KERNEL_RWX_DEFAULT select ARCH_STACKWALK diff --git a/arch/riscv/include/asm/vdso/data.h b/arch/riscv/include/asm/vdso/data.h new file mode 100644 index 000000000000..dc2f76f58b76 --- /dev/null +++ b/arch/riscv/include/asm/vdso/data.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __RISCV_ASM_VDSO_DATA_H +#define __RISCV_ASM_VDSO_DATA_H + +#include +#include +#include + +struct arch_vdso_data { + /* Stash static answers to the hwprobe queries when all CPUs are selected. */ + __u64 all_cpu_hwprobe_values[RISCV_HWPROBE_MAX_KEY + 1]; + + /* Boolean indicating all CPUs have the same static hwprobe values. */ + __u8 homogeneous_cpus; +}; + +#endif /* __RISCV_ASM_VDSO_DATA_H */ diff --git a/arch/riscv/include/asm/vdso/gettimeofday.h b/arch/riscv/include/asm/vdso/gettimeofday.h index 77d9c2f721c4..ba3283cf7acc 100644 --- a/arch/riscv/include/asm/vdso/gettimeofday.h +++ b/arch/riscv/include/asm/vdso/gettimeofday.h @@ -9,6 +9,12 @@ #include #include +/* + * 32-bit land is lacking generic time vsyscalls as well as the legacy 32-bit + * time syscalls like gettimeofday. Skip these definitions since on 32-bit. 
+ */ +#ifdef CONFIG_GENERIC_TIME_VSYSCALL + #define VDSO_HAS_CLOCK_GETRES 1 static __always_inline @@ -60,6 +66,8 @@ int clock_getres_fallback(clockid_t _clkid, struct __kernel_timespec *_ts) return ret; } +#endif /* CONFIG_GENERIC_TIME_VSYSCALL */ + static __always_inline u64 __arch_get_hw_counter(s32 clock_mode, const struct vdso_data *vd) { diff --git a/arch/riscv/kernel/compat_vdso/Makefile b/arch/riscv/kernel/compat_vdso/Makefile index 260daf3236d3..91a3431ae1fe 100644 --- a/arch/riscv/kernel/compat_vdso/Makefile +++ b/arch/riscv/kernel/compat_vdso/Makefile @@ -22,7 +22,7 @@ targets := $(obj-compat_vdso) compat_vdso.so compat_vdso.so.dbg compat_vdso.lds obj-compat_vdso := $(addprefix $(obj)/, $(obj-compat_vdso)) obj-y += compat_vdso.o -CPPFLAGS_compat_vdso.lds += -P -C -U$(ARCH) +CPPFLAGS_compat_vdso.lds += -P -C -DCOMPAT_VDSO -U$(ARCH) # Disable profiling and instrumentation for VDSO code GCOV_PROFILE := n diff --git a/arch/riscv/kernel/sys_riscv.c b/arch/riscv/kernel/sys_riscv.c index 55389e7595f6..849b4170629d 100644 --- a/arch/riscv/kernel/sys_riscv.c +++ b/arch/riscv/kernel/sys_riscv.c @@ -14,6 +14,7 @@ #include #include #include +#include static long riscv_sys_mmap(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, @@ -243,6 +244,50 @@ static int do_riscv_hwprobe(struct riscv_hwprobe __user *pairs, return 0; } +#ifdef CONFIG_MMU + +static int __init init_hwprobe_vdso_data(void) +{ + struct vdso_data *vd = __arch_get_k_vdso_data(); + struct arch_vdso_data *avd = &vd->arch_data; + u64 id_bitsmash = 0; + struct riscv_hwprobe pair; + int key; + + /* + * Initialize vDSO data with the answers for the "all CPUs" case, to + * save a syscall in the common case. + */ + for (key = 0; key <= RISCV_HWPROBE_MAX_KEY; key++) { + pair.key = key; + hwprobe_one_pair(&pair, cpu_online_mask); + + WARN_ON_ONCE(pair.key < 0); + + avd->all_cpu_hwprobe_values[key] = pair.value; + /* + * Smash together the vendor, arch, and impl IDs to see if + * they're all 0 or any negative. + */ + if (key <= RISCV_HWPROBE_KEY_MIMPID) + id_bitsmash |= pair.value; + } + + /* + * If the arch, vendor, and implementation ID are all the same across + * all harts, then assume all CPUs are the same, and allow the vDSO to + * answer queries for arbitrary masks. However if all values are 0 (not + * populated) or any value returns -1 (varies across CPUs), then the + * vDSO should defer to the kernel for exotic cpu masks. 
+ */ + avd->homogeneous_cpus = id_bitsmash != 0 && id_bitsmash != -1; + return 0; +} + +arch_initcall_sync(init_hwprobe_vdso_data); + +#endif /* CONFIG_MMU */ + SYSCALL_DEFINE5(riscv_hwprobe, struct riscv_hwprobe __user *, pairs, size_t, pair_count, size_t, cpu_count, unsigned long __user *, cpus, unsigned int, flags) diff --git a/arch/riscv/kernel/vdso.c b/arch/riscv/kernel/vdso.c index cc2d1e8c8736..9a68e7eaae4d 100644 --- a/arch/riscv/kernel/vdso.c +++ b/arch/riscv/kernel/vdso.c @@ -14,13 +14,7 @@ #include #include #include - -#ifdef CONFIG_GENERIC_TIME_VSYSCALL #include -#else -struct vdso_data { -}; -#endif enum vvar_pages { VVAR_DATA_PAGE_OFFSET, diff --git a/arch/riscv/kernel/vdso/Makefile b/arch/riscv/kernel/vdso/Makefile index 06e6b27f3bcc..022258426050 100644 --- a/arch/riscv/kernel/vdso/Makefile +++ b/arch/riscv/kernel/vdso/Makefile @@ -12,6 +12,8 @@ vdso-syms += vgettimeofday endif vdso-syms += getcpu vdso-syms += flush_icache +vdso-syms += hwprobe +vdso-syms += sys_hwprobe # Files to link into the vdso obj-vdso = $(patsubst %, %.o, $(vdso-syms)) note.o @@ -23,6 +25,8 @@ ifneq ($(c-gettimeofday-y),) CFLAGS_vgettimeofday.o += -fPIC -include $(c-gettimeofday-y) endif +CFLAGS_hwprobe.o += -fPIC + # Build rules targets := $(obj-vdso) vdso.so vdso.so.dbg vdso.lds obj-vdso := $(addprefix $(obj)/, $(obj-vdso)) diff --git a/arch/riscv/kernel/vdso/hwprobe.c b/arch/riscv/kernel/vdso/hwprobe.c new file mode 100644 index 000000000000..d40bec6ac078 --- /dev/null +++ b/arch/riscv/kernel/vdso/hwprobe.c @@ -0,0 +1,52 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2023 Rivos, Inc + */ + +#include +#include +#include + +extern int riscv_hwprobe(struct riscv_hwprobe *pairs, size_t pair_count, + size_t cpu_count, unsigned long *cpus, + unsigned int flags); + +/* Add a prototype to avoid -Wmissing-prototypes warning. */ +int __vdso_riscv_hwprobe(struct riscv_hwprobe *pairs, size_t pair_count, + size_t cpu_count, unsigned long *cpus, + unsigned int flags); + +int __vdso_riscv_hwprobe(struct riscv_hwprobe *pairs, size_t pair_count, + size_t cpu_count, unsigned long *cpus, + unsigned int flags) +{ + const struct vdso_data *vd = __arch_get_vdso_data(); + const struct arch_vdso_data *avd = &vd->arch_data; + bool all_cpus = !cpu_count && !cpus; + struct riscv_hwprobe *p = pairs; + struct riscv_hwprobe *end = pairs + pair_count; + + /* + * Defer to the syscall for exotic requests. The vdso has answers + * stashed away only for the "all cpus" case. If all CPUs are + * homogeneous, then this function can handle requests for arbitrary + * masks. + */ + if ((flags != 0) || (!all_cpus && !avd->homogeneous_cpus)) + return riscv_hwprobe(pairs, pair_count, cpu_count, cpus, flags); + + /* This is something we can handle, fill out the pairs.
*/ + while (p < end) { + if (p->key <= RISCV_HWPROBE_MAX_KEY) { + p->value = avd->all_cpu_hwprobe_values[p->key]; + + } else { + p->key = -1; + p->value = 0; + } + + p++; + } + + return 0; +} diff --git a/arch/riscv/kernel/vdso/sys_hwprobe.S b/arch/riscv/kernel/vdso/sys_hwprobe.S new file mode 100644 index 000000000000..4e704146c77a --- /dev/null +++ b/arch/riscv/kernel/vdso/sys_hwprobe.S @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2023 Rivos, Inc */ + +#include +#include + +.text +ENTRY(riscv_hwprobe) + .cfi_startproc + li a7, __NR_riscv_hwprobe + ecall + ret + + .cfi_endproc +ENDPROC(riscv_hwprobe) diff --git a/arch/riscv/kernel/vdso/vdso.lds.S b/arch/riscv/kernel/vdso/vdso.lds.S index 4a0606633290..82ce64900f3d 100644 --- a/arch/riscv/kernel/vdso/vdso.lds.S +++ b/arch/riscv/kernel/vdso/vdso.lds.S @@ -82,6 +82,9 @@ VERSION #endif __vdso_getcpu; __vdso_flush_icache; +#ifndef COMPAT_VDSO + __vdso_riscv_hwprobe; +#endif local: *; }; } -- cgit v1.2.3 From c4b52d8b6c1de1e6359bef2d1394d5917940b3dc Mon Sep 17 00:00:00 2001 From: Song Shuai Date: Thu, 23 Mar 2023 20:39:24 +0800 Subject: riscv: export cpu/freq invariant to scheduler RISC-V now manages CPU topology using arch_topology which provides CPU capacity and frequency related interfaces to access the cpu/freq invariant in possible heterogeneous or DVFS-enabled platforms. Here adds topology.h file to export the arch_topology interfaces for replacing the scheduler's constant-based cpu/freq invariant accounting. Signed-off-by: Song Shuai Reviewed-by: Andrew Jones Reviewed-by: Ley Foon Tan Reviewed-by: Conor Dooley Link: https://lore.kernel.org/r/20230323123924.3032174-1-suagrfillet@gmail.com [Palmer: Fix the whitespace issues.] Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/topology.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 arch/riscv/include/asm/topology.h diff --git a/arch/riscv/include/asm/topology.h b/arch/riscv/include/asm/topology.h new file mode 100644 index 000000000000..727e8d163a3b --- /dev/null +++ b/arch/riscv/include/asm/topology.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_RISCV_TOPOLOGY_H +#define _ASM_RISCV_TOPOLOGY_H + +#include + +/* Replace task scheduler's default frequency-invariant accounting */ +#define arch_scale_freq_tick topology_scale_freq_tick +#define arch_set_freq_scale topology_set_freq_scale +#define arch_scale_freq_capacity topology_get_freq_scale +#define arch_scale_freq_invariant topology_scale_freq_invariant + +/* Replace task scheduler's default cpu-invariant accounting */ +#define arch_scale_cpu_capacity topology_get_cpu_scale + +/* Enable topology flag updates */ +#define arch_update_cpu_topology topology_update_cpu_topology + +#include +#endif /* _ASM_RISCV_TOPOLOGY_H */ -- cgit v1.2.3 From 8bf7b3b6676270c0ed3e4968608e1e052ecc8606 Mon Sep 17 00:00:00 2001 From: Song Shuai Date: Fri, 10 Mar 2023 19:03:36 +0800 Subject: riscv: Kconfig: enable SCHED_MC kconfig RISC-V now builds the sched domain based on the simple possible map. Enable SCHED_MC to make the building based on cpu_coregroup_mask() which also takes care of the NUMA and cores with LLC. 
Signed-off-by: Song Shuai Acked-by: Conor Dooley Link: https://lore.kernel.org/r/20230310110336.970985-1-suagrfillet@gmail.com Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index aaa11fb48b86..2db7b1b1b85c 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -322,6 +322,14 @@ config SMP If you don't know what to do here, say N. +config SCHED_MC + bool "Multi-core scheduler support" + depends on SMP + help + Multi-core scheduler support improves the CPU scheduler's decision + making when dealing with multi-core CPU chips at a cost of slightly + increased overhead in some places. If unsure say N here. + config NR_CPUS int "Maximum number of CPUs (2-512)" depends on SMP -- cgit v1.2.3 From 5464912cfae706aff47f6253c495bf81284bc5d5 Mon Sep 17 00:00:00 2001 From: Conor Dooley Date: Wed, 5 Apr 2023 11:21:10 +0100 Subject: RISC-V: align ISA extension Kconfig help text with each other Other extensions only capitalise the first letter in the text visible in Kconfig menus, and provide a short comment about the extension's meaning. Do the same for Svnapot & Svpbmt. The precedent for capitalisation in the Kconfig text was set by Zicbom & sorta followed for Zicboz. The RVI styling used for multi-letter extensions only capitalises the first letter, so do the same here. If nothing else, my OCD likes it when the extensions follow a consistent pattern. While editing one of the lines, reformat the "spelling" of 64-bit. Signed-off-by: Conor Dooley Reviewed-by: Andrew Jones Link: https://lore.kernel.org/r/20230405-pucker-cogwheel-3a999a94a2f2@wendy Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 2db7b1b1b85c..ba94d08a7d0e 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -409,31 +409,31 @@ config RISCV_ISA_C If you don't know what to do here, say Y. config RISCV_ISA_SVNAPOT - bool "SVNAPOT extension support" + bool "Svnapot extension support for supervisor mode NAPOT pages" depends on 64BIT && MMU depends on RISCV_ALTERNATIVE default y help - Allow kernel to detect the SVNAPOT ISA-extension dynamically at boot + Allow kernel to detect the Svnapot ISA-extension dynamically at boot time and enable its usage. - The SVNAPOT extension is used to mark contiguous PTEs as a range + The Svnapot extension is used to mark contiguous PTEs as a range of contiguous virtual-to-physical translations for a naturally aligned power-of-2 (NAPOT) granularity larger than the base 4KB page size. When HUGETLBFS is also selected this option unconditionally allocates some memory for each NAPOT page size supported by the kernel. When optimizing for low memory consumption and for platforms without - the SVNAPOT extension, it may be better to say N here. + the Svnapot extension, it may be better to say N here. If you don't know what to do here, say Y. config RISCV_ISA_SVPBMT - bool "SVPBMT extension support" + bool "Svpbmt extension support for supervisor mode page-based memory types" depends on 64BIT && MMU depends on RISCV_ALTERNATIVE default y help - Adds support to dynamically detect the presence of the SVPBMT + Adds support to dynamically detect the presence of the Svpbmt ISA-extension (Supervisor-mode: page-based memory types) and enable its usage. 
@@ -441,7 +441,7 @@ config RISCV_ISA_SVPBMT that indicate the cacheability, idempotency, and ordering properties for access to that page. - The SVPBMT extension is only available on 64Bit cpus. + The Svpbmt extension is only available on 64-bit cpus. If you don't know what to do here, say Y. @@ -491,7 +491,7 @@ config RISCV_ISA_ZICBOZ depends on RISCV_ALTERNATIVE default y help - Enable the use of the ZICBOZ extension (cbo.zero instruction) + Enable the use of the Zicboz extension (cbo.zero instruction) when available. The Zicboz extension is used for faster zeroing of memory. -- cgit v1.2.3 From a7407a1318a9d7b5a7de7ab19935de5fc4c47a57 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Fri, 24 Mar 2023 16:54:19 +0100 Subject: riscv: Get rid of riscv_pfn_base variable Use directly phys_ram_base instead, riscv_pfn_base is just the pfn of the address contained in phys_ram_base. Even if there is no functional change intended in this patch, actually setting phys_ram_base that early changes the behaviour of kernel_mapping_pa_to_va during the early boot: phys_ram_base used to be zero before this patch and now it is set to the physical start address of the kernel. But it does not break the conversion of a kernel physical address into a virtual address since kernel_mapping_pa_to_va should only be used on kernel physical addresses, i.e. addresses greater than the physical start address of the kernel. Signed-off-by: Alexandre Ghiti Reviewed-by: Andrew Jones Reviewed-by: Anup Patel Tested-by: Anup Patel Link: https://lore.kernel.org/r/20230324155421.271544-2-alexghiti@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/page.h | 3 +-- arch/riscv/mm/init.c | 6 +----- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h index 7fed7c431928..8dc686f549b6 100644 --- a/arch/riscv/include/asm/page.h +++ b/arch/riscv/include/asm/page.h @@ -91,8 +91,7 @@ typedef struct page *pgtable_t; #endif #ifdef CONFIG_MMU -extern unsigned long riscv_pfn_base; -#define ARCH_PFN_OFFSET (riscv_pfn_base) +#define ARCH_PFN_OFFSET (PFN_DOWN((unsigned long)phys_ram_base)) #else #define ARCH_PFN_OFFSET (PAGE_OFFSET >> PAGE_SHIFT) #endif /* CONFIG_MMU */ diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 478d6763a01a..225a7d2b65cc 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -271,9 +271,6 @@ static void __init setup_bootmem(void) #ifdef CONFIG_MMU struct pt_alloc_ops pt_ops __initdata; -unsigned long riscv_pfn_base __ro_after_init; -EXPORT_SYMBOL(riscv_pfn_base); - pgd_t swapper_pg_dir[PTRS_PER_PGD] __page_aligned_bss; pgd_t trampoline_pg_dir[PTRS_PER_PGD] __page_aligned_bss; static pte_t fixmap_pte[PTRS_PER_PTE] __page_aligned_bss; @@ -285,7 +282,6 @@ static pmd_t __maybe_unused early_dtb_pmd[PTRS_PER_PMD] __initdata __aligned(PAG #ifdef CONFIG_XIP_KERNEL #define pt_ops (*(struct pt_alloc_ops *)XIP_FIXUP(&pt_ops)) -#define riscv_pfn_base (*(unsigned long *)XIP_FIXUP(&riscv_pfn_base)) #define trampoline_pg_dir ((pgd_t *)XIP_FIXUP(trampoline_pg_dir)) #define fixmap_pte ((pte_t *)XIP_FIXUP(fixmap_pte)) #define early_pg_dir ((pgd_t *)XIP_FIXUP(early_pg_dir)) @@ -985,7 +981,7 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) kernel_map.va_pa_offset = PAGE_OFFSET - kernel_map.phys_addr; kernel_map.va_kernel_pa_offset = kernel_map.virt_addr - kernel_map.phys_addr; - riscv_pfn_base = PFN_DOWN(kernel_map.phys_addr); + phys_ram_base = kernel_map.phys_addr; /* * The default maximal physical memory size is 
KERN_VIRT_SIZE for 32-bit -- cgit v1.2.3 From 8589e346bbb679cf7a4b564f75295d94250058f0 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Fri, 24 Mar 2023 16:54:20 +0100 Subject: riscv: Move the linear mapping creation into its own function No functional change intended; this just splits the linear mapping creation out of setup_vm_final, in preparation for upcoming additions to the linear mapping creation. Signed-off-by: Alexandre Ghiti Reviewed-by: Andrew Jones Reviewed-by: Anup Patel Tested-by: Anup Patel Link: https://lore.kernel.org/r/20230324155421.271544-3-alexghiti@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/init.c | 42 ++++++++++++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 14 deletions(-) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 225a7d2b65cc..ed07ae111f3a 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -1086,16 +1086,25 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) pt_ops_set_fixmap(); } -static void __init setup_vm_final(void) +static void __init create_linear_mapping_range(phys_addr_t start, + phys_addr_t end) { + phys_addr_t pa; uintptr_t va, map_size; - phys_addr_t pa, start, end; - u64 i; - /* Setup swapper PGD for fixmap */ - create_pgd_mapping(swapper_pg_dir, FIXADDR_START, - __pa_symbol(fixmap_pgd_next), - PGDIR_SIZE, PAGE_TABLE); + for (pa = start; pa < end; pa += map_size) { + va = (uintptr_t)__va(pa); + map_size = best_map_size(pa, end - pa); + + create_pgd_mapping(swapper_pg_dir, va, pa, map_size, + pgprot_from_va(va)); + } +} + +static void __init create_linear_mapping_page_table(void) +{ + phys_addr_t start, end; + u64 i; /* Map all memory banks in the linear mapping */ for_each_mem_range(i, &start, &end) { @@ -1107,14 +1116,19 @@ static void __init setup_vm_final(void) if (end >= __pa(PAGE_OFFSET) + memory_limit) end = __pa(PAGE_OFFSET) + memory_limit; - for (pa = start; pa < end; pa += map_size) { - va = (uintptr_t)__va(pa); - map_size = best_map_size(pa, end - pa); - - create_pgd_mapping(swapper_pg_dir, va, pa, map_size, - pgprot_from_va(va)); - } + create_linear_mapping_range(start, end); } +} + +static void __init setup_vm_final(void) +{ + /* Setup swapper PGD for fixmap */ + create_pgd_mapping(swapper_pg_dir, FIXADDR_START, + __pa_symbol(fixmap_pgd_next), + PGDIR_SIZE, PAGE_TABLE); + + /* Map the linear mapping */ + create_linear_mapping_page_table(); /* Map the kernel */ if (IS_ENABLED(CONFIG_64BIT)) -- cgit v1.2.3 From 3335068f87217ea59d08f462187dc856652eea15 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Fri, 24 Mar 2023 16:54:21 +0100 Subject: riscv: Use PUD/P4D/PGD pages for the linear mapping During the early page table creation, we used to set the mapping for PAGE_OFFSET to the kernel load address: but the kernel load address is always offset by PMD_SIZE, which makes it impossible to use PUD/P4D/PGD pages as this physical address is not aligned on PUD/P4D/PGD size (whereas PAGE_OFFSET is). But actually we don't have to establish this mapping (i.e. set va_pa_offset) that early in the boot process because: - first, setup_vm installs a temporary kernel mapping and, among other things, discovers the system memory, - then, setup_vm_final creates the final kernel mapping and takes advantage of the discovered system memory to create the linear mapping.
During the first phase, we don't know the start of the system memory, so until the second phase is finished we can't use the linear mapping at all, and phys_to_virt/virt_to_phys translations must not be used either, because they would result in a different translation from the 'real' one once the final mapping is installed. So here we simply delay the initialization of va_pa_offset until after the system memory discovery. But to make sure no one uses the linear mapping before then, we add some guards under the DEBUG_VIRTUAL config. Finally we can use PUD/P4D/PGD hugepages when possible, which will result in better TLB utilization. Note that: - this does not apply to rv32 as the kernel mapping lies in the linear mapping. - we rely on the firmware to protect itself using PMP. Signed-off-by: Alexandre Ghiti Acked-by: Rob Herring # DT bits Reviewed-by: Andrew Jones Reviewed-by: Anup Patel Tested-by: Anup Patel Link: https://lore.kernel.org/r/20230324155421.271544-4-alexghiti@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/page.h | 16 ++++++++++++ arch/riscv/mm/init.c | 58 ++++++++++++++++++++++++++++++++++++++----- arch/riscv/mm/physaddr.c | 16 ++++++++++++ drivers/of/fdt.c | 11 ++++---- 4 files changed, 90 insertions(+), 11 deletions(-) diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h index 8dc686f549b6..ea1a0e237211 100644 --- a/arch/riscv/include/asm/page.h +++ b/arch/riscv/include/asm/page.h @@ -90,6 +90,14 @@ typedef struct page *pgtable_t; #define PTE_FMT "%08lx" #endif +#ifdef CONFIG_64BIT +/* + * We override this value as its generic definition uses __pa too early in + * the boot process (before kernel_map.va_pa_offset is set). + */ +#define MIN_MEMBLOCK_ADDR 0 +#endif + #ifdef CONFIG_MMU #define ARCH_PFN_OFFSET (PFN_DOWN((unsigned long)phys_ram_base)) #else @@ -121,7 +129,11 @@ extern phys_addr_t phys_ram_base; #define is_linear_mapping(x) \ ((x) >= PAGE_OFFSET && (!IS_ENABLED(CONFIG_64BIT) || (x) < PAGE_OFFSET + KERN_VIRT_SIZE)) +#ifndef CONFIG_DEBUG_VIRTUAL #define linear_mapping_pa_to_va(x) ((void *)((unsigned long)(x) + kernel_map.va_pa_offset)) +#else +void *linear_mapping_pa_to_va(unsigned long x); +#endif #define kernel_mapping_pa_to_va(y) ({ \ unsigned long _y = (unsigned long)(y); \ (IS_ENABLED(CONFIG_XIP_KERNEL) && _y < phys_ram_base) ? \ @@ -130,7 +142,11 @@ extern phys_addr_t phys_ram_base; }) #define __pa_to_va_nodebug(x) linear_mapping_pa_to_va(x) +#ifndef CONFIG_DEBUG_VIRTUAL #define linear_mapping_va_to_pa(x) ((unsigned long)(x) - kernel_map.va_pa_offset) +#else +phys_addr_t linear_mapping_va_to_pa(unsigned long x); +#endif #define kernel_mapping_va_to_pa(y) ({ \ unsigned long _y = (unsigned long)(y); \ (IS_ENABLED(CONFIG_XIP_KERNEL) && _y < kernel_map.virt_addr + XIP_OFFSET) ? \ diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index ed07ae111f3a..7bd66795165d 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -213,6 +213,14 @@ static void __init setup_bootmem(void) phys_ram_end = memblock_end_of_DRAM(); if (!IS_ENABLED(CONFIG_XIP_KERNEL)) phys_ram_base = memblock_start_of_DRAM(); + + /* + * In 64-bit, any use of __va/__pa before this point is wrong as we + * did not know the start of DRAM before.
+ */ + if (IS_ENABLED(CONFIG_64BIT)) + kernel_map.va_pa_offset = PAGE_OFFSET - phys_ram_base; + /* * memblock allocator is not aware of the fact that last 4K bytes of * the addressable memory can not be mapped because of IS_ERR_VALUE @@ -667,9 +675,16 @@ void __init create_pgd_mapping(pgd_t *pgdp, static uintptr_t __init best_map_size(phys_addr_t base, phys_addr_t size) { - /* Upgrade to PMD_SIZE mappings whenever possible */ - base &= PMD_SIZE - 1; - if (!base && size >= PMD_SIZE) + if (!(base & (PGDIR_SIZE - 1)) && size >= PGDIR_SIZE) + return PGDIR_SIZE; + + if (!(base & (P4D_SIZE - 1)) && size >= P4D_SIZE) + return P4D_SIZE; + + if (!(base & (PUD_SIZE - 1)) && size >= PUD_SIZE) + return PUD_SIZE; + + if (!(base & (PMD_SIZE - 1)) && size >= PMD_SIZE) return PMD_SIZE; return PAGE_SIZE; @@ -978,11 +993,22 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) set_satp_mode(); #endif - kernel_map.va_pa_offset = PAGE_OFFSET - kernel_map.phys_addr; + /* + * In 64-bit, we defer the setup of va_pa_offset to setup_bootmem, + * where we have the system memory layout: this allows us to align + * the physical and virtual mappings and then make use of PUD/P4D/PGD + * for the linear mapping. This is only possible because the kernel + * mapping lies outside the linear mapping. + * In 32-bit however, as the kernel resides in the linear mapping, + * setup_vm_final can not change the mapping established here, + * otherwise the same kernel addresses would get mapped to different + * physical addresses (if the start of dram is different from the + * kernel physical address start). + */ + kernel_map.va_pa_offset = IS_ENABLED(CONFIG_64BIT) ? + 0UL : PAGE_OFFSET - kernel_map.phys_addr; kernel_map.va_kernel_pa_offset = kernel_map.virt_addr - kernel_map.phys_addr; - phys_ram_base = kernel_map.phys_addr; - /* * The default maximal physical memory size is KERN_VIRT_SIZE for 32-bit * kernel, whereas for 64-bit kernel, the end of the virtual address @@ -1106,6 +1132,17 @@ static void __init create_linear_mapping_page_table(void) phys_addr_t start, end; u64 i; +#ifdef CONFIG_STRICT_KERNEL_RWX + phys_addr_t ktext_start = __pa_symbol(_start); + phys_addr_t ktext_size = __init_data_begin - _start; + phys_addr_t krodata_start = __pa_symbol(__start_rodata); + phys_addr_t krodata_size = _data - __start_rodata; + + /* Isolate kernel text and rodata so they don't get mapped with a PUD */ + memblock_mark_nomap(ktext_start, ktext_size); + memblock_mark_nomap(krodata_start, krodata_size); +#endif + /* Map all memory banks in the linear mapping */ for_each_mem_range(i, &start, &end) { if (start >= end) @@ -1118,6 +1155,15 @@ static void __init create_linear_mapping_page_table(void) create_linear_mapping_range(start, end); } + +#ifdef CONFIG_STRICT_KERNEL_RWX + create_linear_mapping_range(ktext_start, ktext_start + ktext_size); + create_linear_mapping_range(krodata_start, + krodata_start + krodata_size); + + memblock_clear_nomap(ktext_start, ktext_size); + memblock_clear_nomap(krodata_start, krodata_size); +#endif } static void __init setup_vm_final(void) diff --git a/arch/riscv/mm/physaddr.c b/arch/riscv/mm/physaddr.c index 9b18bda74154..18706f457da7 100644 --- a/arch/riscv/mm/physaddr.c +++ b/arch/riscv/mm/physaddr.c @@ -33,3 +33,19 @@ phys_addr_t __phys_addr_symbol(unsigned long x) return __va_to_pa_nodebug(x); } EXPORT_SYMBOL(__phys_addr_symbol); + +phys_addr_t linear_mapping_va_to_pa(unsigned long x) +{ + BUG_ON(!kernel_map.va_pa_offset); + + return ((unsigned long)(x) - kernel_map.va_pa_offset); +} 
+EXPORT_SYMBOL(linear_mapping_va_to_pa); + +void *linear_mapping_pa_to_va(unsigned long x) +{ + BUG_ON(!kernel_map.va_pa_offset); + + return ((void *)((unsigned long)(x) + kernel_map.va_pa_offset)); +} +EXPORT_SYMBOL(linear_mapping_pa_to_va); diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c index d1a68b6d03b3..d14735a81301 100644 --- a/drivers/of/fdt.c +++ b/drivers/of/fdt.c @@ -887,12 +887,13 @@ const void * __init of_flat_dt_match_machine(const void *default_match, static void __early_init_dt_declare_initrd(unsigned long start, unsigned long end) { - /* ARM64 would cause a BUG to occur here when CONFIG_DEBUG_VM is - * enabled since __va() is called too early. ARM64 does make use - * of phys_initrd_start/phys_initrd_size so we can skip this - * conversion. + /* + * __va() is not yet available this early on some platforms. In that + * case, the platform uses phys_initrd_start/phys_initrd_size instead + * and does the VA conversion itself. */ - if (!IS_ENABLED(CONFIG_ARM64)) { + if (!IS_ENABLED(CONFIG_ARM64) && + !(IS_ENABLED(CONFIG_RISCV) && IS_ENABLED(CONFIG_64BIT))) { initrd_start = (unsigned long)__va(start); initrd_end = (unsigned long)__va(end); initrd_below_start_ok = 1; -- cgit v1.2.3 From cd0334e1c091fc0263ccf76e81d333a2ece4f3ab Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Fri, 3 Feb 2023 08:52:27 +0100 Subject: riscv: Split early and final KASAN population functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a preliminary work that allows to make the code more understandable. Signed-off-by: Alexandre Ghiti Reviewed-by: Björn Töpel Link: https://lore.kernel.org/r/20230203075232.274282-2-alexghiti@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/kasan_init.c | 185 ++++++++++++++++++++++++++++----------------- 1 file changed, 116 insertions(+), 69 deletions(-) diff --git a/arch/riscv/mm/kasan_init.c b/arch/riscv/mm/kasan_init.c index e1226709490f..2a48eba6bd08 100644 --- a/arch/riscv/mm/kasan_init.c +++ b/arch/riscv/mm/kasan_init.c @@ -95,23 +95,13 @@ static void __init kasan_populate_pmd(pud_t *pud, unsigned long vaddr, unsigned } static void __init kasan_populate_pud(pgd_t *pgd, - unsigned long vaddr, unsigned long end, - bool early) + unsigned long vaddr, unsigned long end) { phys_addr_t phys_addr; pud_t *pudp, *base_pud; unsigned long next; - if (early) { - /* - * We can't use pgd_page_vaddr here as it would return a linear - * mapping address but it is not mapped yet, but when populating - * early_pg_dir, we need the physical address and when populating - * swapper_pg_dir, we need the kernel virtual address so use - * pt_ops facility. 
- */ - base_pud = pt_ops.get_pud_virt(pfn_to_phys(_pgd_pfn(*pgd))); - } else if (pgd_none(*pgd)) { + if (pgd_none(*pgd)) { base_pud = memblock_alloc(PTRS_PER_PUD * sizeof(pud_t), PAGE_SIZE); memcpy(base_pud, (void *)kasan_early_shadow_pud, sizeof(pud_t) * PTRS_PER_PUD); @@ -130,16 +120,10 @@ static void __init kasan_populate_pud(pgd_t *pgd, next = pud_addr_end(vaddr, end); if (pud_none(*pudp) && IS_ALIGNED(vaddr, PUD_SIZE) && (next - vaddr) >= PUD_SIZE) { - if (early) { - phys_addr = __pa(((uintptr_t)kasan_early_shadow_pmd)); - set_pud(pudp, pfn_pud(PFN_DOWN(phys_addr), PAGE_TABLE)); + phys_addr = memblock_phys_alloc(PUD_SIZE, PUD_SIZE); + if (phys_addr) { + set_pud(pudp, pfn_pud(PFN_DOWN(phys_addr), PAGE_KERNEL)); continue; - } else { - phys_addr = memblock_phys_alloc(PUD_SIZE, PUD_SIZE); - if (phys_addr) { - set_pud(pudp, pfn_pud(PFN_DOWN(phys_addr), PAGE_KERNEL)); - continue; - } } } @@ -152,34 +136,21 @@ static void __init kasan_populate_pud(pgd_t *pgd, * it entirely, memblock could allocate a page at a physical address * where KASAN is not populated yet and then we'd get a page fault. */ - if (!early) - set_pgd(pgd, pfn_pgd(PFN_DOWN(__pa(base_pud)), PAGE_TABLE)); + set_pgd(pgd, pfn_pgd(PFN_DOWN(__pa(base_pud)), PAGE_TABLE)); } static void __init kasan_populate_p4d(pgd_t *pgd, - unsigned long vaddr, unsigned long end, - bool early) + unsigned long vaddr, unsigned long end) { phys_addr_t phys_addr; p4d_t *p4dp, *base_p4d; unsigned long next; - if (early) { - /* - * We can't use pgd_page_vaddr here as it would return a linear - * mapping address but it is not mapped yet, but when populating - * early_pg_dir, we need the physical address and when populating - * swapper_pg_dir, we need the kernel virtual address so use - * pt_ops facility. - */ - base_p4d = pt_ops.get_p4d_virt(pfn_to_phys(_pgd_pfn(*pgd))); - } else { - base_p4d = (p4d_t *)pgd_page_vaddr(*pgd); - if (base_p4d == lm_alias(kasan_early_shadow_p4d)) { - base_p4d = memblock_alloc(PTRS_PER_PUD * sizeof(p4d_t), PAGE_SIZE); - memcpy(base_p4d, (void *)kasan_early_shadow_p4d, - sizeof(p4d_t) * PTRS_PER_P4D); - } + base_p4d = (p4d_t *)pgd_page_vaddr(*pgd); + if (base_p4d == lm_alias(kasan_early_shadow_p4d)) { + base_p4d = memblock_alloc(PTRS_PER_PUD * sizeof(p4d_t), PAGE_SIZE); + memcpy(base_p4d, (void *)kasan_early_shadow_p4d, + sizeof(p4d_t) * PTRS_PER_P4D); } p4dp = base_p4d + p4d_index(vaddr); @@ -188,20 +159,14 @@ static void __init kasan_populate_p4d(pgd_t *pgd, next = p4d_addr_end(vaddr, end); if (p4d_none(*p4dp) && IS_ALIGNED(vaddr, P4D_SIZE) && (next - vaddr) >= P4D_SIZE) { - if (early) { - phys_addr = __pa(((uintptr_t)kasan_early_shadow_pud)); - set_p4d(p4dp, pfn_p4d(PFN_DOWN(phys_addr), PAGE_TABLE)); + phys_addr = memblock_phys_alloc(P4D_SIZE, P4D_SIZE); + if (phys_addr) { + set_p4d(p4dp, pfn_p4d(PFN_DOWN(phys_addr), PAGE_KERNEL)); continue; - } else { - phys_addr = memblock_phys_alloc(P4D_SIZE, P4D_SIZE); - if (phys_addr) { - set_p4d(p4dp, pfn_p4d(PFN_DOWN(phys_addr), PAGE_KERNEL)); - continue; - } } } - kasan_populate_pud((pgd_t *)p4dp, vaddr, next, early); + kasan_populate_pud((pgd_t *)p4dp, vaddr, next); } while (p4dp++, vaddr = next, vaddr != end); /* @@ -210,8 +175,7 @@ static void __init kasan_populate_p4d(pgd_t *pgd, * it entirely, memblock could allocate a page at a physical address * where KASAN is not populated yet and then we'd get a page fault. 
*/ - if (!early) - set_pgd(pgd, pfn_pgd(PFN_DOWN(__pa(base_p4d)), PAGE_TABLE)); + set_pgd(pgd, pfn_pgd(PFN_DOWN(__pa(base_p4d)), PAGE_TABLE)); } #define kasan_early_shadow_pgd_next (pgtable_l5_enabled ? \ @@ -219,16 +183,15 @@ static void __init kasan_populate_p4d(pgd_t *pgd, (pgtable_l4_enabled ? \ (uintptr_t)kasan_early_shadow_pud : \ (uintptr_t)kasan_early_shadow_pmd)) -#define kasan_populate_pgd_next(pgdp, vaddr, next, early) \ +#define kasan_populate_pgd_next(pgdp, vaddr, next) \ (pgtable_l5_enabled ? \ - kasan_populate_p4d(pgdp, vaddr, next, early) : \ + kasan_populate_p4d(pgdp, vaddr, next) : \ (pgtable_l4_enabled ? \ - kasan_populate_pud(pgdp, vaddr, next, early) : \ + kasan_populate_pud(pgdp, vaddr, next) : \ kasan_populate_pmd((pud_t *)pgdp, vaddr, next))) static void __init kasan_populate_pgd(pgd_t *pgdp, - unsigned long vaddr, unsigned long end, - bool early) + unsigned long vaddr, unsigned long end) { phys_addr_t phys_addr; unsigned long next; @@ -237,11 +200,7 @@ static void __init kasan_populate_pgd(pgd_t *pgdp, next = pgd_addr_end(vaddr, end); if (IS_ALIGNED(vaddr, PGDIR_SIZE) && (next - vaddr) >= PGDIR_SIZE) { - if (early) { - phys_addr = __pa((uintptr_t)kasan_early_shadow_pgd_next); - set_pgd(pgdp, pfn_pgd(PFN_DOWN(phys_addr), PAGE_TABLE)); - continue; - } else if (pgd_page_vaddr(*pgdp) == + if (pgd_page_vaddr(*pgdp) == (unsigned long)lm_alias(kasan_early_shadow_pgd_next)) { /* * pgdp can't be none since kasan_early_init @@ -258,7 +217,95 @@ static void __init kasan_populate_pgd(pgd_t *pgdp, } } - kasan_populate_pgd_next(pgdp, vaddr, next, early); + kasan_populate_pgd_next(pgdp, vaddr, next); + } while (pgdp++, vaddr = next, vaddr != end); +} + +static void __init kasan_early_populate_pud(p4d_t *p4dp, + unsigned long vaddr, + unsigned long end) +{ + pud_t *pudp, *base_pud; + phys_addr_t phys_addr; + unsigned long next; + + if (!pgtable_l4_enabled) { + pudp = (pud_t *)p4dp; + } else { + base_pud = pt_ops.get_pud_virt(pfn_to_phys(_p4d_pfn(*p4dp))); + pudp = base_pud + pud_index(vaddr); + } + + do { + next = pud_addr_end(vaddr, end); + + if (pud_none(*pudp) && IS_ALIGNED(vaddr, PUD_SIZE) && + (next - vaddr) >= PUD_SIZE) { + phys_addr = __pa((uintptr_t)kasan_early_shadow_pmd); + set_pud(pudp, pfn_pud(PFN_DOWN(phys_addr), PAGE_TABLE)); + continue; + } + + BUG(); + } while (pudp++, vaddr = next, vaddr != end); +} + +static void __init kasan_early_populate_p4d(pgd_t *pgdp, + unsigned long vaddr, + unsigned long end) +{ + p4d_t *p4dp, *base_p4d; + phys_addr_t phys_addr; + unsigned long next; + + /* + * We can't use pgd_page_vaddr here as it would return a linear + * mapping address but it is not mapped yet, but when populating + * early_pg_dir, we need the physical address and when populating + * swapper_pg_dir, we need the kernel virtual address so use + * pt_ops facility. 
+ * Note that this test is then completely equivalent to + * p4dp = p4d_offset(pgdp, vaddr) + */ + if (!pgtable_l5_enabled) { + p4dp = (p4d_t *)pgdp; + } else { + base_p4d = pt_ops.get_p4d_virt(pfn_to_phys(_pgd_pfn(*pgdp))); + p4dp = base_p4d + p4d_index(vaddr); + } + + do { + next = p4d_addr_end(vaddr, end); + + if (p4d_none(*p4dp) && IS_ALIGNED(vaddr, P4D_SIZE) && + (next - vaddr) >= P4D_SIZE) { + phys_addr = __pa((uintptr_t)kasan_early_shadow_pud); + set_p4d(p4dp, pfn_p4d(PFN_DOWN(phys_addr), PAGE_TABLE)); + continue; + } + + kasan_early_populate_pud(p4dp, vaddr, next); + } while (p4dp++, vaddr = next, vaddr != end); +} + +static void __init kasan_early_populate_pgd(pgd_t *pgdp, + unsigned long vaddr, + unsigned long end) +{ + phys_addr_t phys_addr; + unsigned long next; + + do { + next = pgd_addr_end(vaddr, end); + + if (pgd_none(*pgdp) && IS_ALIGNED(vaddr, PGDIR_SIZE) && + (next - vaddr) >= PGDIR_SIZE) { + phys_addr = __pa((uintptr_t)kasan_early_shadow_p4d); + set_pgd(pgdp, pfn_pgd(PFN_DOWN(phys_addr), PAGE_TABLE)); + continue; + } + + kasan_early_populate_p4d(pgdp, vaddr, next); } while (pgdp++, vaddr = next, vaddr != end); } @@ -295,16 +342,16 @@ asmlinkage void __init kasan_early_init(void) PAGE_TABLE)); } - kasan_populate_pgd(early_pg_dir + pgd_index(KASAN_SHADOW_START), - KASAN_SHADOW_START, KASAN_SHADOW_END, true); + kasan_early_populate_pgd(early_pg_dir + pgd_index(KASAN_SHADOW_START), + KASAN_SHADOW_START, KASAN_SHADOW_END); local_flush_tlb_all(); } void __init kasan_swapper_init(void) { - kasan_populate_pgd(pgd_offset_k(KASAN_SHADOW_START), - KASAN_SHADOW_START, KASAN_SHADOW_END, true); + kasan_early_populate_pgd(pgd_offset_k(KASAN_SHADOW_START), + KASAN_SHADOW_START, KASAN_SHADOW_END); local_flush_tlb_all(); } @@ -314,7 +361,7 @@ static void __init kasan_populate(void *start, void *end) unsigned long vaddr = (unsigned long)start & PAGE_MASK; unsigned long vend = PAGE_ALIGN((unsigned long)end); - kasan_populate_pgd(pgd_offset_k(vaddr), vaddr, vend, false); + kasan_populate_pgd(pgd_offset_k(vaddr), vaddr, vend); local_flush_tlb_all(); memset(start, KASAN_SHADOW_INIT, end - start); -- cgit v1.2.3 From 96f9d4daf745205fe869e9e5ac23199ef11c5448 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Fri, 3 Feb 2023 08:52:28 +0100 Subject: riscv: Rework kasan population functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Our previous kasan population implementation used to have the final kasan shadow region mapped with kasan_early_shadow_page, because we did not clean the early mapping and then we had to populate the kasan region "in-place" which made the code cumbersome. So now we clear the early mapping, establish a temporary mapping while we populate the kasan shadow region with just the kernel regions that will be used. This new version uses the "generic" way of going through a page table that may be folded at runtime (avoid the XXX_next macros). It was tested with outline instrumentation on an Ubuntu kernel configuration successfully. 
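For readers following the shadow arithmetic in these KASAN patches, generic KASAN maps each 8 bytes of address space to one shadow byte. Here is an editor's sketch of the translation; the scale shift is the generic KASAN constant, but the offset value below is a placeholder assumption, not the real RISC-V KASAN_SHADOW_OFFSET:

/*
 * Editor's sketch of the generic KASAN address-to-shadow translation
 * (kernel-style GNU C, void-pointer arithmetic intended).
 */
#define KASAN_SHADOW_SCALE_SHIFT	3
#define KASAN_SHADOW_OFFSET		0xdffffc0000000000UL	/* assumed, not the riscv value */

static inline void *kasan_mem_to_shadow_sketch(const void *addr)
{
	return (void *)((unsigned long)addr >> KASAN_SHADOW_SCALE_SHIFT)
		+ KASAN_SHADOW_OFFSET;
}

This is why the patches in this series populate or clear shadow in lockstep with the regions they cover: any kernel range an instrumented access can touch must have its shifted-and-offset shadow mapped first.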
Signed-off-by: Alexandre Ghiti Reviewed-by: Björn Töpel Link: https://lore.kernel.org/r/20230203075232.274282-3-alexghiti@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/kasan_init.c | 361 +++++++++++++++++++++++---------------------- 1 file changed, 183 insertions(+), 178 deletions(-) diff --git a/arch/riscv/mm/kasan_init.c b/arch/riscv/mm/kasan_init.c index 2a48eba6bd08..8fc0efcf905c 100644 --- a/arch/riscv/mm/kasan_init.c +++ b/arch/riscv/mm/kasan_init.c @@ -18,58 +18,48 @@ * For sv39, the region is aligned on PGDIR_SIZE so we only need to populate * the page global directory with kasan_early_shadow_pmd. * - * For sv48 and sv57, the region is not aligned on PGDIR_SIZE so the mapping - * must be divided as follows: - * - the first PGD entry, although incomplete, is populated with - * kasan_early_shadow_pud/p4d - * - the PGD entries in the middle are populated with kasan_early_shadow_pud/p4d - * - the last PGD entry is shared with the kernel mapping so populated at the - * lower levels pud/p4d - * - * In addition, when shallow populating a kasan region (for example vmalloc), - * this region may also not be aligned on PGDIR size, so we must go down to the - * pud level too. + * For sv48 and sv57, the region start is aligned on PGDIR_SIZE whereas the end + * region is not and then we have to go down to the PUD level. */ extern pgd_t early_pg_dir[PTRS_PER_PGD]; +pgd_t tmp_pg_dir[PTRS_PER_PGD] __page_aligned_bss; +p4d_t tmp_p4d[PTRS_PER_P4D] __page_aligned_bss; +pud_t tmp_pud[PTRS_PER_PUD] __page_aligned_bss; static void __init kasan_populate_pte(pmd_t *pmd, unsigned long vaddr, unsigned long end) { phys_addr_t phys_addr; - pte_t *ptep, *base_pte; + pte_t *ptep, *p; - if (pmd_none(*pmd)) - base_pte = memblock_alloc(PTRS_PER_PTE * sizeof(pte_t), PAGE_SIZE); - else - base_pte = (pte_t *)pmd_page_vaddr(*pmd); + if (pmd_none(*pmd)) { + p = memblock_alloc(PTRS_PER_PTE * sizeof(pte_t), PAGE_SIZE); + set_pmd(pmd, pfn_pmd(PFN_DOWN(__pa(p)), PAGE_TABLE)); + } - ptep = base_pte + pte_index(vaddr); + ptep = pte_offset_kernel(pmd, vaddr); do { if (pte_none(*ptep)) { phys_addr = memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE); set_pte(ptep, pfn_pte(PFN_DOWN(phys_addr), PAGE_KERNEL)); + memset(__va(phys_addr), KASAN_SHADOW_INIT, PAGE_SIZE); } } while (ptep++, vaddr += PAGE_SIZE, vaddr != end); - - set_pmd(pmd, pfn_pmd(PFN_DOWN(__pa(base_pte)), PAGE_TABLE)); } static void __init kasan_populate_pmd(pud_t *pud, unsigned long vaddr, unsigned long end) { phys_addr_t phys_addr; - pmd_t *pmdp, *base_pmd; + pmd_t *pmdp, *p; unsigned long next; if (pud_none(*pud)) { - base_pmd = memblock_alloc(PTRS_PER_PMD * sizeof(pmd_t), PAGE_SIZE); - } else { - base_pmd = (pmd_t *)pud_pgtable(*pud); - if (base_pmd == lm_alias(kasan_early_shadow_pmd)) - base_pmd = memblock_alloc(PTRS_PER_PMD * sizeof(pmd_t), PAGE_SIZE); + p = memblock_alloc(PTRS_PER_PMD * sizeof(pmd_t), PAGE_SIZE); + set_pud(pud, pfn_pud(PFN_DOWN(__pa(p)), PAGE_TABLE)); } - pmdp = base_pmd + pmd_index(vaddr); + pmdp = pmd_offset(pud, vaddr); do { next = pmd_addr_end(vaddr, end); @@ -78,43 +68,28 @@ static void __init kasan_populate_pmd(pud_t *pud, unsigned long vaddr, unsigned phys_addr = memblock_phys_alloc(PMD_SIZE, PMD_SIZE); if (phys_addr) { set_pmd(pmdp, pfn_pmd(PFN_DOWN(phys_addr), PAGE_KERNEL)); + memset(__va(phys_addr), KASAN_SHADOW_INIT, PMD_SIZE); continue; } } kasan_populate_pte(pmdp, vaddr, next); } while (pmdp++, vaddr = next, vaddr != end); - - /* - * Wait for the whole PGD to be populated before setting the PGD in - * the page table, 
otherwise, if we did set the PGD before populating - * it entirely, memblock could allocate a page at a physical address - * where KASAN is not populated yet and then we'd get a page fault. - */ - set_pud(pud, pfn_pud(PFN_DOWN(__pa(base_pmd)), PAGE_TABLE)); } -static void __init kasan_populate_pud(pgd_t *pgd, +static void __init kasan_populate_pud(p4d_t *p4d, unsigned long vaddr, unsigned long end) { phys_addr_t phys_addr; - pud_t *pudp, *base_pud; + pud_t *pudp, *p; unsigned long next; - if (pgd_none(*pgd)) { - base_pud = memblock_alloc(PTRS_PER_PUD * sizeof(pud_t), PAGE_SIZE); - memcpy(base_pud, (void *)kasan_early_shadow_pud, - sizeof(pud_t) * PTRS_PER_PUD); - } else { - base_pud = (pud_t *)pgd_page_vaddr(*pgd); - if (base_pud == lm_alias(kasan_early_shadow_pud)) { - base_pud = memblock_alloc(PTRS_PER_PUD * sizeof(pud_t), PAGE_SIZE); - memcpy(base_pud, (void *)kasan_early_shadow_pud, - sizeof(pud_t) * PTRS_PER_PUD); - } + if (p4d_none(*p4d)) { + p = memblock_alloc(PTRS_PER_PUD * sizeof(pud_t), PAGE_SIZE); + set_p4d(p4d, pfn_p4d(PFN_DOWN(__pa(p)), PAGE_TABLE)); } - pudp = base_pud + pud_index(vaddr); + pudp = pud_offset(p4d, vaddr); do { next = pud_addr_end(vaddr, end); @@ -123,37 +98,28 @@ static void __init kasan_populate_pud(pgd_t *pgd, phys_addr = memblock_phys_alloc(PUD_SIZE, PUD_SIZE); if (phys_addr) { set_pud(pudp, pfn_pud(PFN_DOWN(phys_addr), PAGE_KERNEL)); + memset(__va(phys_addr), KASAN_SHADOW_INIT, PUD_SIZE); continue; } } kasan_populate_pmd(pudp, vaddr, next); } while (pudp++, vaddr = next, vaddr != end); - - /* - * Wait for the whole PGD to be populated before setting the PGD in - * the page table, otherwise, if we did set the PGD before populating - * it entirely, memblock could allocate a page at a physical address - * where KASAN is not populated yet and then we'd get a page fault. - */ - set_pgd(pgd, pfn_pgd(PFN_DOWN(__pa(base_pud)), PAGE_TABLE)); } static void __init kasan_populate_p4d(pgd_t *pgd, unsigned long vaddr, unsigned long end) { phys_addr_t phys_addr; - p4d_t *p4dp, *base_p4d; + p4d_t *p4dp, *p; unsigned long next; - base_p4d = (p4d_t *)pgd_page_vaddr(*pgd); - if (base_p4d == lm_alias(kasan_early_shadow_p4d)) { - base_p4d = memblock_alloc(PTRS_PER_PUD * sizeof(p4d_t), PAGE_SIZE); - memcpy(base_p4d, (void *)kasan_early_shadow_p4d, - sizeof(p4d_t) * PTRS_PER_P4D); + if (pgd_none(*pgd)) { + p = memblock_alloc(PTRS_PER_P4D * sizeof(p4d_t), PAGE_SIZE); + set_pgd(pgd, pfn_pgd(PFN_DOWN(__pa(p)), PAGE_TABLE)); } - p4dp = base_p4d + p4d_index(vaddr); + p4dp = p4d_offset(pgd, vaddr); do { next = p4d_addr_end(vaddr, end); @@ -162,34 +128,15 @@ static void __init kasan_populate_p4d(pgd_t *pgd, phys_addr = memblock_phys_alloc(P4D_SIZE, P4D_SIZE); if (phys_addr) { set_p4d(p4dp, pfn_p4d(PFN_DOWN(phys_addr), PAGE_KERNEL)); + memset(__va(phys_addr), KASAN_SHADOW_INIT, P4D_SIZE); continue; } } - kasan_populate_pud((pgd_t *)p4dp, vaddr, next); + kasan_populate_pud(p4dp, vaddr, next); } while (p4dp++, vaddr = next, vaddr != end); - - /* - * Wait for the whole P4D to be populated before setting the P4D in - * the page table, otherwise, if we did set the P4D before populating - * it entirely, memblock could allocate a page at a physical address - * where KASAN is not populated yet and then we'd get a page fault. - */ - set_pgd(pgd, pfn_pgd(PFN_DOWN(__pa(base_p4d)), PAGE_TABLE)); } -#define kasan_early_shadow_pgd_next (pgtable_l5_enabled ? \ - (uintptr_t)kasan_early_shadow_p4d : \ - (pgtable_l4_enabled ? 
\ - (uintptr_t)kasan_early_shadow_pud : \ - (uintptr_t)kasan_early_shadow_pmd)) -#define kasan_populate_pgd_next(pgdp, vaddr, next) \ - (pgtable_l5_enabled ? \ - kasan_populate_p4d(pgdp, vaddr, next) : \ - (pgtable_l4_enabled ? \ - kasan_populate_pud(pgdp, vaddr, next) : \ - kasan_populate_pmd((pud_t *)pgdp, vaddr, next))) - static void __init kasan_populate_pgd(pgd_t *pgdp, unsigned long vaddr, unsigned long end) { @@ -199,25 +146,86 @@ static void __init kasan_populate_pgd(pgd_t *pgdp, do { next = pgd_addr_end(vaddr, end); - if (IS_ALIGNED(vaddr, PGDIR_SIZE) && (next - vaddr) >= PGDIR_SIZE) { - if (pgd_page_vaddr(*pgdp) == - (unsigned long)lm_alias(kasan_early_shadow_pgd_next)) { - /* - * pgdp can't be none since kasan_early_init - * initialized all KASAN shadow region with - * kasan_early_shadow_pud: if this is still the - * case, that means we can try to allocate a - * hugepage as a replacement. - */ - phys_addr = memblock_phys_alloc(PGDIR_SIZE, PGDIR_SIZE); - if (phys_addr) { - set_pgd(pgdp, pfn_pgd(PFN_DOWN(phys_addr), PAGE_KERNEL)); - continue; - } + if (pgd_none(*pgdp) && IS_ALIGNED(vaddr, PGDIR_SIZE) && + (next - vaddr) >= PGDIR_SIZE) { + phys_addr = memblock_phys_alloc(PGDIR_SIZE, PGDIR_SIZE); + if (phys_addr) { + set_pgd(pgdp, pfn_pgd(PFN_DOWN(phys_addr), PAGE_KERNEL)); + memset(__va(phys_addr), KASAN_SHADOW_INIT, PGDIR_SIZE); + continue; } } - kasan_populate_pgd_next(pgdp, vaddr, next); + kasan_populate_p4d(pgdp, vaddr, next); + } while (pgdp++, vaddr = next, vaddr != end); +} + +static void __init kasan_early_clear_pud(p4d_t *p4dp, + unsigned long vaddr, unsigned long end) +{ + pud_t *pudp, *base_pud; + unsigned long next; + + if (!pgtable_l4_enabled) { + pudp = (pud_t *)p4dp; + } else { + base_pud = pt_ops.get_pud_virt(pfn_to_phys(_p4d_pfn(*p4dp))); + pudp = base_pud + pud_index(vaddr); + } + + do { + next = pud_addr_end(vaddr, end); + + if (IS_ALIGNED(vaddr, PUD_SIZE) && (next - vaddr) >= PUD_SIZE) { + pud_clear(pudp); + continue; + } + + BUG(); + } while (pudp++, vaddr = next, vaddr != end); +} + +static void __init kasan_early_clear_p4d(pgd_t *pgdp, + unsigned long vaddr, unsigned long end) +{ + p4d_t *p4dp, *base_p4d; + unsigned long next; + + if (!pgtable_l5_enabled) { + p4dp = (p4d_t *)pgdp; + } else { + base_p4d = pt_ops.get_p4d_virt(pfn_to_phys(_pgd_pfn(*pgdp))); + p4dp = base_p4d + p4d_index(vaddr); + } + + do { + next = p4d_addr_end(vaddr, end); + + if (pgtable_l4_enabled && IS_ALIGNED(vaddr, P4D_SIZE) && + (next - vaddr) >= P4D_SIZE) { + p4d_clear(p4dp); + continue; + } + + kasan_early_clear_pud(p4dp, vaddr, next); + } while (p4dp++, vaddr = next, vaddr != end); +} + +static void __init kasan_early_clear_pgd(pgd_t *pgdp, + unsigned long vaddr, unsigned long end) +{ + unsigned long next; + + do { + next = pgd_addr_end(vaddr, end); + + if (pgtable_l5_enabled && IS_ALIGNED(vaddr, PGDIR_SIZE) && + (next - vaddr) >= PGDIR_SIZE) { + pgd_clear(pgdp); + continue; + } + + kasan_early_clear_p4d(pgdp, vaddr, next); } while (pgdp++, vaddr = next, vaddr != end); } @@ -362,117 +370,64 @@ static void __init kasan_populate(void *start, void *end) unsigned long vend = PAGE_ALIGN((unsigned long)end); kasan_populate_pgd(pgd_offset_k(vaddr), vaddr, vend); - - local_flush_tlb_all(); - memset(start, KASAN_SHADOW_INIT, end - start); -} - -static void __init kasan_shallow_populate_pmd(pgd_t *pgdp, - unsigned long vaddr, unsigned long end) -{ - unsigned long next; - pmd_t *pmdp, *base_pmd; - bool is_kasan_pte; - - base_pmd = (pmd_t *)pgd_page_vaddr(*pgdp); - pmdp = base_pmd + 
pmd_index(vaddr); - - do { - next = pmd_addr_end(vaddr, end); - is_kasan_pte = (pmd_pgtable(*pmdp) == lm_alias(kasan_early_shadow_pte)); - - if (is_kasan_pte) - pmd_clear(pmdp); - } while (pmdp++, vaddr = next, vaddr != end); } -static void __init kasan_shallow_populate_pud(pgd_t *pgdp, +static void __init kasan_shallow_populate_pud(p4d_t *p4d, unsigned long vaddr, unsigned long end) { unsigned long next; - pud_t *pudp, *base_pud; - pmd_t *base_pmd; - bool is_kasan_pmd; - - base_pud = (pud_t *)pgd_page_vaddr(*pgdp); - pudp = base_pud + pud_index(vaddr); + void *p; + pud_t *pud_k = pud_offset(p4d, vaddr); do { next = pud_addr_end(vaddr, end); - is_kasan_pmd = (pud_pgtable(*pudp) == lm_alias(kasan_early_shadow_pmd)); - if (!is_kasan_pmd) - continue; - - base_pmd = memblock_alloc(PAGE_SIZE, PAGE_SIZE); - set_pud(pudp, pfn_pud(PFN_DOWN(__pa(base_pmd)), PAGE_TABLE)); - - if (IS_ALIGNED(vaddr, PUD_SIZE) && (next - vaddr) >= PUD_SIZE) + if (pud_none(*pud_k)) { + p = memblock_alloc(PAGE_SIZE, PAGE_SIZE); + set_pud(pud_k, pfn_pud(PFN_DOWN(__pa(p)), PAGE_TABLE)); continue; + } - memcpy(base_pmd, (void *)kasan_early_shadow_pmd, PAGE_SIZE); - kasan_shallow_populate_pmd((pgd_t *)pudp, vaddr, next); - } while (pudp++, vaddr = next, vaddr != end); + BUG(); + } while (pud_k++, vaddr = next, vaddr != end); } -static void __init kasan_shallow_populate_p4d(pgd_t *pgdp, +static void __init kasan_shallow_populate_p4d(pgd_t *pgd, unsigned long vaddr, unsigned long end) { unsigned long next; - p4d_t *p4dp, *base_p4d; - pud_t *base_pud; - bool is_kasan_pud; - - base_p4d = (p4d_t *)pgd_page_vaddr(*pgdp); - p4dp = base_p4d + p4d_index(vaddr); + void *p; + p4d_t *p4d_k = p4d_offset(pgd, vaddr); do { next = p4d_addr_end(vaddr, end); - is_kasan_pud = (p4d_pgtable(*p4dp) == lm_alias(kasan_early_shadow_pud)); - - if (!is_kasan_pud) - continue; - - base_pud = memblock_alloc(PAGE_SIZE, PAGE_SIZE); - set_p4d(p4dp, pfn_p4d(PFN_DOWN(__pa(base_pud)), PAGE_TABLE)); - if (IS_ALIGNED(vaddr, P4D_SIZE) && (next - vaddr) >= P4D_SIZE) + if (p4d_none(*p4d_k)) { + p = memblock_alloc(PAGE_SIZE, PAGE_SIZE); + set_p4d(p4d_k, pfn_p4d(PFN_DOWN(__pa(p)), PAGE_TABLE)); continue; + } - memcpy(base_pud, (void *)kasan_early_shadow_pud, PAGE_SIZE); - kasan_shallow_populate_pud((pgd_t *)p4dp, vaddr, next); - } while (p4dp++, vaddr = next, vaddr != end); + kasan_shallow_populate_pud(p4d_k, vaddr, end); + } while (p4d_k++, vaddr = next, vaddr != end); } -#define kasan_shallow_populate_pgd_next(pgdp, vaddr, next) \ - (pgtable_l5_enabled ? \ - kasan_shallow_populate_p4d(pgdp, vaddr, next) : \ - (pgtable_l4_enabled ? 
\ - kasan_shallow_populate_pud(pgdp, vaddr, next) : \ - kasan_shallow_populate_pmd(pgdp, vaddr, next))) - static void __init kasan_shallow_populate_pgd(unsigned long vaddr, unsigned long end) { unsigned long next; void *p; pgd_t *pgd_k = pgd_offset_k(vaddr); - bool is_kasan_pgd_next; do { next = pgd_addr_end(vaddr, end); - is_kasan_pgd_next = (pgd_page_vaddr(*pgd_k) == - (unsigned long)lm_alias(kasan_early_shadow_pgd_next)); - if (is_kasan_pgd_next) { + if (pgd_none(*pgd_k)) { p = memblock_alloc(PAGE_SIZE, PAGE_SIZE); set_pgd(pgd_k, pfn_pgd(PFN_DOWN(__pa(p)), PAGE_TABLE)); - } - - if (IS_ALIGNED(vaddr, PGDIR_SIZE) && (next - vaddr) >= PGDIR_SIZE) continue; + } - memcpy(p, (void *)kasan_early_shadow_pgd_next, PAGE_SIZE); - kasan_shallow_populate_pgd_next(pgd_k, vaddr, next); + kasan_shallow_populate_p4d(pgd_k, vaddr, next); } while (pgd_k++, vaddr = next, vaddr != end); } @@ -482,7 +437,37 @@ static void __init kasan_shallow_populate(void *start, void *end) unsigned long vend = PAGE_ALIGN((unsigned long)end); kasan_shallow_populate_pgd(vaddr, vend); - local_flush_tlb_all(); +} + +static void create_tmp_mapping(void) +{ + void *ptr; + p4d_t *base_p4d; + + /* + * We need to clean the early mapping: this is hard to achieve "in-place", + * so install a temporary mapping like arm64 and x86 do. + */ + memcpy(tmp_pg_dir, swapper_pg_dir, sizeof(pgd_t) * PTRS_PER_PGD); + + /* Copy the last p4d since it is shared with the kernel mapping. */ + if (pgtable_l5_enabled) { + ptr = (p4d_t *)pgd_page_vaddr(*pgd_offset_k(KASAN_SHADOW_END)); + memcpy(tmp_p4d, ptr, sizeof(p4d_t) * PTRS_PER_P4D); + set_pgd(&tmp_pg_dir[pgd_index(KASAN_SHADOW_END)], + pfn_pgd(PFN_DOWN(__pa(tmp_p4d)), PAGE_TABLE)); + base_p4d = tmp_p4d; + } else { + base_p4d = (p4d_t *)tmp_pg_dir; + } + + /* Copy the last pud since it is shared with the kernel mapping. 
*/ + if (pgtable_l4_enabled) { + ptr = (pud_t *)p4d_page_vaddr(*(base_p4d + p4d_index(KASAN_SHADOW_END))); + memcpy(tmp_pud, ptr, sizeof(pud_t) * PTRS_PER_PUD); + set_p4d(&base_p4d[p4d_index(KASAN_SHADOW_END)], + pfn_p4d(PFN_DOWN(__pa(tmp_pud)), PAGE_TABLE)); + } } void __init kasan_init(void) @@ -490,10 +475,27 @@ void __init kasan_init(void) phys_addr_t p_start, p_end; u64 i; - if (IS_ENABLED(CONFIG_KASAN_VMALLOC)) + create_tmp_mapping(); + csr_write(CSR_SATP, PFN_DOWN(__pa(tmp_pg_dir)) | satp_mode); + + kasan_early_clear_pgd(pgd_offset_k(KASAN_SHADOW_START), + KASAN_SHADOW_START, KASAN_SHADOW_END); + + kasan_populate_early_shadow((void *)kasan_mem_to_shadow((void *)FIXADDR_START), + (void *)kasan_mem_to_shadow((void *)VMALLOC_START)); + + if (IS_ENABLED(CONFIG_KASAN_VMALLOC)) { kasan_shallow_populate( (void *)kasan_mem_to_shadow((void *)VMALLOC_START), (void *)kasan_mem_to_shadow((void *)VMALLOC_END)); + /* Shallow populate modules and BPF which are vmalloc-allocated */ + kasan_shallow_populate( + (void *)kasan_mem_to_shadow((void *)MODULES_VADDR), + (void *)kasan_mem_to_shadow((void *)MODULES_END)); + } else { + kasan_populate_early_shadow((void *)kasan_mem_to_shadow((void *)VMALLOC_START), + (void *)kasan_mem_to_shadow((void *)VMALLOC_END)); + } /* Populate the linear mapping */ for_each_mem_range(i, &p_start, &p_end) { @@ -506,8 +508,8 @@ void __init kasan_init(void) kasan_populate(kasan_mem_to_shadow(start), kasan_mem_to_shadow(end)); } - /* Populate kernel, BPF, modules mapping */ - kasan_populate(kasan_mem_to_shadow((const void *)MODULES_VADDR), + /* Populate kernel */ + kasan_populate(kasan_mem_to_shadow((const void *)MODULES_END), kasan_mem_to_shadow((const void *)MODULES_VADDR + SZ_2G)); for (i = 0; i < PTRS_PER_PTE; i++) @@ -518,4 +520,7 @@ void __init kasan_init(void) memset(kasan_early_shadow_page, KASAN_SHADOW_INIT, PAGE_SIZE); init_task.kasan_depth = 0; + + csr_write(CSR_SATP, PFN_DOWN(__pa(swapper_pg_dir)) | satp_mode); + local_flush_tlb_all(); } -- cgit v1.2.3 From 401e84488800d05e8ed6db2a687eaa94415f4ec8 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Fri, 3 Feb 2023 08:52:29 +0100 Subject: riscv: Move DTB_EARLY_BASE_VA to the kernel address space MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The early virtual address should lie in the kernel address space for inline kasan instrumentation to succeed, otherwise kasan tries to dereference an address that does not exist in the address space (since kasan only maps *kernel* address space, not the userspace). Simply use the very first address of the kernel address space for the early fdt mapping. It allowed an Ubuntu kernel to boot successfully with inline instrumentation. 
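To see why the new DTB_EARLY_BASE_VA lands exactly at the first kernel virtual address, here is an editor's worked example for sv39; the parameter values are the usual sv39 ones and are assumptions of this sketch, not quoted from the patch:

/*
 * Editor's worked example, assuming sv39:
 *   ADDRESS_SPACE_END = 0xffffffffffffffff
 *   PTRS_PER_PGD      = 512
 *   PGDIR_SIZE        = 1GB = 0x40000000
 *
 * The upper half of the PGD (PTRS_PER_PGD / 2 = 256 entries) covers the
 * kernel address space, i.e. 256 * 1GB = 0x4000000000 bytes, so:
 *
 *   DTB_EARLY_BASE_VA = 0xffffffffffffffff - 0x4000000000 + 1
 *                     = 0xffffffc000000000
 *
 * i.e. the very first address of the kernel half of the address space,
 * a range the inline KASAN instrumentation has shadow coverage for.
 */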
Signed-off-by: Alexandre Ghiti Reviewed-by: Björn Töpel Link: https://lore.kernel.org/r/20230203075232.274282-4-alexghiti@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 478d6763a01a..87f6a5d475a6 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -57,7 +57,7 @@ unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] EXPORT_SYMBOL(empty_zero_page); extern char _start[]; -#define DTB_EARLY_BASE_VA PGDIR_SIZE +#define DTB_EARLY_BASE_VA (ADDRESS_SPACE_END - (PTRS_PER_PGD / 2 * PGDIR_SIZE) + 1) void *_dtb_early_va __initdata; uintptr_t _dtb_early_pa __initdata; -- cgit v1.2.3 From 617955ca6e275c4dd0dcf5316fca7fc04a8f2fe6 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Fri, 3 Feb 2023 08:52:30 +0100 Subject: riscv: Fix EFI stub usage of KASAN instrumented strcmp function The EFI stub must not use any KASAN instrumented code as the kernel proper did not initialize the thread pointer and the mapping for the KASAN shadow region. Avoid using the generic strcmp function, instead use the one in drivers/firmware/efi/libstub/string.c. Signed-off-by: Alexandre Ghiti Acked-by: Ard Biesheuvel Reviewed-by: Atish Patra Link: https://lore.kernel.org/r/20230203075232.274282-5-alexghiti@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/image-vars.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/riscv/kernel/image-vars.h b/arch/riscv/kernel/image-vars.h index 7e2962ef73f9..15616155008c 100644 --- a/arch/riscv/kernel/image-vars.h +++ b/arch/riscv/kernel/image-vars.h @@ -23,8 +23,6 @@ * linked at. The routines below are all implemented in assembler in a * position independent manner */ -__efistub_strcmp = strcmp; - __efistub__start = _start; __efistub__start_kernel = _start_kernel; __efistub__end = _end; -- cgit v1.2.3 From ecd7ebaf0b5a094a6180b299a5635c0eea42be4b Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Fri, 3 Feb 2023 08:52:31 +0100 Subject: riscv: Fix ptdump when KASAN is enabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The KASAN shadow region was moved next to the kernel mapping but the ptdump code was not updated and it appears to break the dump of the kernel page table, so fix this by moving the KASAN shadow region in ptdump. 
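The constraint behind this fix is that ptdump's address_markers[] must be listed in ascending virtual-address order, because marker names are emitted as the page-table walk passes each start_address. A minimal editor's sketch of that consumption pattern (simplified, not the actual ptdump code):

#include <stdio.h>

/*
 * Editor's sketch: ptdump-style marker consumption. The walk only ever
 * moves the marker cursor forward, so a marker whose start_address is
 * out of order is never announced and its region gets misattributed.
 */
struct addr_marker {
	unsigned long start_address;
	const char *name;	/* a NULL name terminates the array */
};

static const struct addr_marker *emit_markers(const struct addr_marker *m,
					      unsigned long va)
{
	while ((m + 1)->name && va >= (m + 1)->start_address) {
		m++;
		printf("---[ %s ]---\n", m->name);
	}
	return m;
}

Since commit f7ae02333d13 placed the KASAN shadow above the linear mapping, its two markers belong after the "Linear mapping" entry, which is exactly where the hunks below move them.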
Fixes: f7ae02333d13 ("riscv: Move KASAN mapping next to the kernel mapping") Signed-off-by: Alexandre Ghiti Tested-by: Björn Töpel Reviewed-by: Björn Töpel Link: https://lore.kernel.org/r/20230203075232.274282-6-alexghiti@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/ptdump.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/arch/riscv/mm/ptdump.c b/arch/riscv/mm/ptdump.c index 830e7de65e3a..20a9f991a6d7 100644 --- a/arch/riscv/mm/ptdump.c +++ b/arch/riscv/mm/ptdump.c @@ -59,10 +59,6 @@ struct ptd_mm_info { }; enum address_markers_idx { -#ifdef CONFIG_KASAN - KASAN_SHADOW_START_NR, - KASAN_SHADOW_END_NR, -#endif FIXMAP_START_NR, FIXMAP_END_NR, PCI_IO_START_NR, @@ -74,6 +70,10 @@ enum address_markers_idx { VMALLOC_START_NR, VMALLOC_END_NR, PAGE_OFFSET_NR, +#ifdef CONFIG_KASAN + KASAN_SHADOW_START_NR, + KASAN_SHADOW_END_NR, +#endif #ifdef CONFIG_64BIT MODULES_MAPPING_NR, KERNEL_MAPPING_NR, @@ -82,10 +82,6 @@ enum address_markers_idx { }; static struct addr_marker address_markers[] = { -#ifdef CONFIG_KASAN - {0, "Kasan shadow start"}, - {0, "Kasan shadow end"}, -#endif {0, "Fixmap start"}, {0, "Fixmap end"}, {0, "PCI I/O start"}, @@ -97,6 +93,10 @@ static struct addr_marker address_markers[] = { {0, "vmalloc() area"}, {0, "vmalloc() end"}, {0, "Linear mapping"}, +#ifdef CONFIG_KASAN + {0, "Kasan shadow start"}, + {0, "Kasan shadow end"}, +#endif #ifdef CONFIG_64BIT {0, "Modules/BPF mapping"}, {0, "Kernel mapping"}, @@ -362,10 +362,6 @@ static int __init ptdump_init(void) { unsigned int i, j; -#ifdef CONFIG_KASAN - address_markers[KASAN_SHADOW_START_NR].start_address = KASAN_SHADOW_START; - address_markers[KASAN_SHADOW_END_NR].start_address = KASAN_SHADOW_END; -#endif address_markers[FIXMAP_START_NR].start_address = FIXADDR_START; address_markers[FIXMAP_END_NR].start_address = FIXADDR_TOP; address_markers[PCI_IO_START_NR].start_address = PCI_IO_START; @@ -377,6 +373,10 @@ static int __init ptdump_init(void) address_markers[VMALLOC_START_NR].start_address = VMALLOC_START; address_markers[VMALLOC_END_NR].start_address = VMALLOC_END; address_markers[PAGE_OFFSET_NR].start_address = PAGE_OFFSET; +#ifdef CONFIG_KASAN + address_markers[KASAN_SHADOW_START_NR].start_address = KASAN_SHADOW_START; + address_markers[KASAN_SHADOW_END_NR].start_address = KASAN_SHADOW_END; +#endif #ifdef CONFIG_64BIT address_markers[MODULES_MAPPING_NR].start_address = MODULES_VADDR; address_markers[KERNEL_MAPPING_NR].start_address = kernel_map.virt_addr; -- cgit v1.2.3 From 864046c512c2cd8418dc928b91981fb12a80396c Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Fri, 3 Feb 2023 08:52:32 +0100 Subject: riscv: Unconditionnally select KASAN_VMALLOC if KASAN MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If KASAN is enabled, VMAP_STACK depends on KASAN_VMALLOC so enable KASAN_VMALLOC with KASAN so that we can enable VMAP_STACK by default. 
Signed-off-by: Alexandre Ghiti Reviewed-by: Björn Töpel Link: https://lore.kernel.org/r/20230203075232.274282-7-alexghiti@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index e2b656043abf..0f226d3261ca 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -117,6 +117,7 @@ config RISCV select HAVE_RSEQ select IRQ_DOMAIN select IRQ_FORCED_THREADING + select KASAN_VMALLOC if KASAN select MODULES_USE_ELF_RELA if MODULES select MODULE_SECTIONS if MODULES select OF -- cgit v1.2.3 From 55de1e4ad43b375566162ae0cc2b56dfa44aae4e Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Wed, 29 Mar 2023 06:53:24 +0200 Subject: riscv: Prepare EFI header for relocatable kernels ld does not handle relocations correctly as explained here [1], a fix for that was proposed by Nelson there but we have to support older toolchains and then provide this fix. Note that llvm does not need this fix and is then excluded. [1] https://sourceware.org/pipermail/binutils/2023-March/126690.html Signed-off-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20230329045329.64565-2-alexghiti@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/set_memory.h | 3 +++ arch/riscv/kernel/efi-header.S | 19 ++++++++++++++++--- arch/riscv/kernel/vmlinux.lds.S | 5 ++--- 3 files changed, 21 insertions(+), 6 deletions(-) diff --git a/arch/riscv/include/asm/set_memory.h b/arch/riscv/include/asm/set_memory.h index a2c14d4b3993..ec11001c3fe0 100644 --- a/arch/riscv/include/asm/set_memory.h +++ b/arch/riscv/include/asm/set_memory.h @@ -56,4 +56,7 @@ bool kernel_page_present(struct page *page); #define SECTION_ALIGN L1_CACHE_BYTES #endif /* CONFIG_STRICT_KERNEL_RWX */ +#define PECOFF_SECTION_ALIGNMENT 0x1000 +#define PECOFF_FILE_ALIGNMENT 0x200 + #endif /* _ASM_RISCV_SET_MEMORY_H */ diff --git a/arch/riscv/kernel/efi-header.S b/arch/riscv/kernel/efi-header.S index 8e733aa48ba6..515b2dfbca75 100644 --- a/arch/riscv/kernel/efi-header.S +++ b/arch/riscv/kernel/efi-header.S @@ -6,6 +6,7 @@ #include #include +#include .macro __EFI_PE_HEADER .long PE_MAGIC @@ -33,7 +34,11 @@ optional_header: .byte 0x02 // MajorLinkerVersion .byte 0x14 // MinorLinkerVersion .long __pecoff_text_end - efi_header_end // SizeOfCode - .long __pecoff_data_virt_size // SizeOfInitializedData +#ifdef __clang__ + .long __pecoff_data_virt_size // SizeOfInitializedData +#else + .long __pecoff_data_virt_end - __pecoff_text_end // SizeOfInitializedData +#endif .long 0 // SizeOfUninitializedData .long __efistub_efi_pe_entry - _start // AddressOfEntryPoint .long efi_header_end - _start // BaseOfCode @@ -91,9 +96,17 @@ section_table: IMAGE_SCN_MEM_EXECUTE // Characteristics .ascii ".data\0\0\0" - .long __pecoff_data_virt_size // VirtualSize +#ifdef __clang__ + .long __pecoff_data_virt_size // VirtualSize +#else + .long __pecoff_data_virt_end - __pecoff_text_end // VirtualSize +#endif .long __pecoff_text_end - _start // VirtualAddress - .long __pecoff_data_raw_size // SizeOfRawData +#ifdef __clang__ + .long __pecoff_data_raw_size // SizeOfRawData +#else + .long __pecoff_data_raw_end - __pecoff_text_end // SizeOfRawData +#endif .long __pecoff_text_end - _start // PointerToRawData .long 0 // PointerToRelocations diff --git a/arch/riscv/kernel/vmlinux.lds.S b/arch/riscv/kernel/vmlinux.lds.S index 53a8ad65b255..1c38294580c0 100644 --- a/arch/riscv/kernel/vmlinux.lds.S +++ b/arch/riscv/kernel/vmlinux.lds.S @@ -27,9 +27,6 @@ ENTRY(_start) jiffies = jiffies_64; 
-PECOFF_SECTION_ALIGNMENT = 0x1000; -PECOFF_FILE_ALIGNMENT = 0x200; - SECTIONS { /* Beginning of code and text segment */ @@ -132,6 +129,7 @@ SECTIONS #ifdef CONFIG_EFI .pecoff_edata_padding : { BYTE(0); . = ALIGN(PECOFF_FILE_ALIGNMENT); } __pecoff_data_raw_size = ABSOLUTE(. - __pecoff_text_end); + __pecoff_data_raw_end = ABSOLUTE(.); #endif /* End of data section */ @@ -142,6 +140,7 @@ SECTIONS #ifdef CONFIG_EFI . = ALIGN(PECOFF_SECTION_ALIGNMENT); __pecoff_data_virt_size = ABSOLUTE(. - __pecoff_text_end); + __pecoff_data_virt_end = ABSOLUTE(.); #endif _end = .; -- cgit v1.2.3 From 69a90d2fe107c8bf6a424af0f30d2b223cdeaf7c Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Wed, 29 Mar 2023 06:53:25 +0200 Subject: riscv: Move .rela.dyn outside of init to avoid empty relocations This is a preparatory patch for relocatable kernels: .rela.dyn should be in .init but doing so actually produces empty relocations, so this should be a temporary commit until we find a solution. This issue was reported here [1]. [1] https://lore.kernel.org/all/4a6fc7a3-9697-a49b-0941-97f32194b0d7@ghiti.fr/. Signed-off-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20230329045329.64565-3-alexghiti@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/vmlinux.lds.S | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/riscv/kernel/vmlinux.lds.S b/arch/riscv/kernel/vmlinux.lds.S index 1c38294580c0..e05e6df44225 100644 --- a/arch/riscv/kernel/vmlinux.lds.S +++ b/arch/riscv/kernel/vmlinux.lds.S @@ -96,10 +96,6 @@ SECTIONS *(.rel.dyn*) } - .rela.dyn : { - *(.rela*) - } - __init_data_end = .; . = ALIGN(8); @@ -126,6 +122,10 @@ SECTIONS *(.sdata*) } + .rela.dyn : { + *(.rela*) + } + #ifdef CONFIG_EFI .pecoff_edata_padding : { BYTE(0); . = ALIGN(PECOFF_FILE_ALIGNMENT); } __pecoff_data_raw_size = ABSOLUTE(. - __pecoff_text_end); -- cgit v1.2.3 From 39b33072941f8bab82aa2c802044062385a046bf Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Wed, 29 Mar 2023 06:53:26 +0200 Subject: riscv: Introduce CONFIG_RELOCATABLE This config allows to compile 64b kernel as PIE and to relocate it at any virtual address at runtime: this paves the way to KASLR. Runtime relocation is possible since relocation metadata are embedded into the kernel. Note that relocating at runtime introduces an overhead even if the kernel is loaded at the same address it was linked at and that the compiler options are those used in arm64 which uses the same RELA relocation format. Signed-off-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20230329045329.64565-4-alexghiti@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 14 +++++++++++ arch/riscv/Makefile | 7 ++++-- arch/riscv/kernel/vmlinux.lds.S | 17 +++++++++++-- arch/riscv/mm/Makefile | 4 +++ arch/riscv/mm/init.c | 54 ++++++++++++++++++++++++++++++++++++++++- 5 files changed, 91 insertions(+), 5 deletions(-) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index c5e42cc37604..ccbb05118ca0 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -561,6 +561,20 @@ config COMPAT If you want to execute 32-bit userspace applications, say Y. +config RELOCATABLE + bool "Build a relocatable kernel" + depends on MMU && 64BIT && !XIP_KERNEL + help + This builds a kernel as a Position Independent Executable (PIE), + which retains all relocation metadata required to relocate the + kernel binary at runtime to a different virtual address than the + address it was linked at. 
+ Since RISCV uses the RELA relocation format, this requires a + relocation pass at runtime even if the kernel is loaded at the + same address it was linked at. + + If unsure, say N. + endmenu # "Kernel features" menu "Boot options" diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile index 6203c3378922..860b09e409c7 100644 --- a/arch/riscv/Makefile +++ b/arch/riscv/Makefile @@ -7,9 +7,12 @@ # OBJCOPYFLAGS := -O binary -LDFLAGS_vmlinux := +ifeq ($(CONFIG_RELOCATABLE),y) + LDFLAGS_vmlinux += -shared -Bsymbolic -z notext -z norelro + KBUILD_CFLAGS += -fPIE +endif ifeq ($(CONFIG_DYNAMIC_FTRACE),y) - LDFLAGS_vmlinux := --no-relax + LDFLAGS_vmlinux += --no-relax KBUILD_CPPFLAGS += -DCC_USING_PATCHABLE_FUNCTION_ENTRY ifeq ($(CONFIG_RISCV_ISA_C),y) CC_FLAGS_FTRACE := -fpatchable-function-entry=4 diff --git a/arch/riscv/kernel/vmlinux.lds.S b/arch/riscv/kernel/vmlinux.lds.S index e05e6df44225..615ff5842690 100644 --- a/arch/riscv/kernel/vmlinux.lds.S +++ b/arch/riscv/kernel/vmlinux.lds.S @@ -122,10 +122,23 @@ SECTIONS *(.sdata*) } - .rela.dyn : { - *(.rela*) + .rela.dyn : ALIGN(8) { + __rela_dyn_start = .; + *(.rela .rela*) + __rela_dyn_end = .; } +#ifdef CONFIG_RELOCATABLE + .data.rel : { *(.data.rel*) } + .got : { *(.got*) } + .plt : { *(.plt) } + .dynamic : { *(.dynamic) } + .dynsym : { *(.dynsym) } + .dynstr : { *(.dynstr) } + .hash : { *(.hash) } + .gnu.hash : { *(.gnu.hash) } +#endif + #ifdef CONFIG_EFI .pecoff_edata_padding : { BYTE(0); . = ALIGN(PECOFF_FILE_ALIGNMENT); } __pecoff_data_raw_size = ABSOLUTE(. - __pecoff_text_end); diff --git a/arch/riscv/mm/Makefile b/arch/riscv/mm/Makefile index 2ac177c05352..b85e9e82f082 100644 --- a/arch/riscv/mm/Makefile +++ b/arch/riscv/mm/Makefile @@ -1,6 +1,10 @@ # SPDX-License-Identifier: GPL-2.0-only CFLAGS_init.o := -mcmodel=medany +ifdef CONFIG_RELOCATABLE +CFLAGS_init.o += -fno-pie +endif + ifdef CONFIG_FTRACE CFLAGS_REMOVE_init.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_cacheflush.o = $(CC_FLAGS_FTRACE) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 478d6763a01a..029a844d4ad8 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -20,6 +20,9 @@ #include #include #include +#ifdef CONFIG_RELOCATABLE +#include +#endif #include #include @@ -146,7 +149,7 @@ static void __init print_vm_layout(void) print_ml("kasan", KASAN_SHADOW_START, KASAN_SHADOW_END); #endif - print_ml("kernel", (unsigned long)KERNEL_LINK_ADDR, + print_ml("kernel", (unsigned long)kernel_map.virt_addr, (unsigned long)ADDRESS_SPACE_END); } } @@ -820,6 +823,44 @@ retry: #error "setup_vm() is called from head.S before relocate so it should not use absolute addressing." #endif +#ifdef CONFIG_RELOCATABLE +extern unsigned long __rela_dyn_start, __rela_dyn_end; + +static void __init relocate_kernel(void) +{ + Elf64_Rela *rela = (Elf64_Rela *)&__rela_dyn_start; + /* + * This holds the offset between the linked virtual address and the + * relocated virtual address. + */ + uintptr_t reloc_offset = kernel_map.virt_addr - KERNEL_LINK_ADDR; + /* + * This holds the offset between kernel linked virtual address and + * physical address. 
+ */ + uintptr_t va_kernel_link_pa_offset = KERNEL_LINK_ADDR - kernel_map.phys_addr; + + for ( ; rela < (Elf64_Rela *)&__rela_dyn_end; rela++) { + Elf64_Addr addr = (rela->r_offset - va_kernel_link_pa_offset); + Elf64_Addr relocated_addr = rela->r_addend; + + if (rela->r_info != R_RISCV_RELATIVE) + continue; + + /* + * Make sure to not relocate vdso symbols like rt_sigreturn + * which are linked from the address 0 in vmlinux since + * vdso symbol addresses are actually used as an offset from + * mm->context.vdso in VDSO_OFFSET macro. + */ + if (relocated_addr >= KERNEL_LINK_ADDR) + relocated_addr += reloc_offset; + + *(Elf64_Addr *)addr = relocated_addr; + } +} +#endif /* CONFIG_RELOCATABLE */ + #ifdef CONFIG_XIP_KERNEL static void __init create_kernel_page_table(pgd_t *pgdir, __always_unused bool early) @@ -1007,6 +1048,17 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) BUG_ON((kernel_map.virt_addr + kernel_map.size) > ADDRESS_SPACE_END - SZ_4K); #endif +#ifdef CONFIG_RELOCATABLE + /* + * Early page table uses only one PUD, which makes it possible + * to map PUD_SIZE aligned on PUD_SIZE: if the relocation offset + * makes the kernel cross over a PUD_SIZE boundary, raise a bug + * since a part of the kernel would not get mapped. + */ + BUG_ON(PUD_SIZE - (kernel_map.virt_addr & (PUD_SIZE - 1)) < kernel_map.size); + relocate_kernel(); +#endif + apply_early_boot_alternatives(); pt_ops_set_early(); -- cgit v1.2.3 From 47981b5cc6871d78aee67b6c9ae70aff90ddb97d Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Wed, 29 Mar 2023 06:53:27 +0200 Subject: powerpc: Move script to check relocations at compile time in scripts/ Relocating kernel at runtime is done very early in the boot process, so it is not convenient to check for relocations there and react in case a relocation was not expected. Powerpc architecture has a script that allows to check at compile time for such unexpected relocations: extract the common logic to scripts/ so that other architectures can take advantage of it. Signed-off-by: Alexandre Ghiti Reviewed-by: Anup Patel Acked-by: Michael Ellerman (powerpc) Link: https://lore.kernel.org/r/20230329045329.64565-5-alexghiti@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/powerpc/tools/relocs_check.sh | 18 ++---------------- scripts/relocs_check.sh | 20 ++++++++++++++++++++ 2 files changed, 22 insertions(+), 16 deletions(-) create mode 100755 scripts/relocs_check.sh diff --git a/arch/powerpc/tools/relocs_check.sh b/arch/powerpc/tools/relocs_check.sh index 63792af00417..6b350e75014c 100755 --- a/arch/powerpc/tools/relocs_check.sh +++ b/arch/powerpc/tools/relocs_check.sh @@ -15,21 +15,8 @@ if [ $# -lt 3 ]; then exit 1 fi -# Have Kbuild supply the path to objdump and nm so we handle cross compilation. -objdump="$1" -nm="$2" -vmlinux="$3" - -# Remove from the bad relocations those that match an undefined weak symbol -# which will result in an absolute relocation to 0. -# Weak unresolved symbols are of that form in nm output: -# " w _binary__btf_vmlinux_bin_end" -undef_weak_symbols=$($nm "$vmlinux" | awk '$1 ~ /w/ { print $2 }') - bad_relocs=$( -$objdump -R "$vmlinux" | - # Only look at relocation lines. - grep -E '\ Date: Wed, 29 Mar 2023 06:53:28 +0200 Subject: riscv: Check relocations at compile time Relocating kernel at runtime is done very early in the boot process, so it is not convenient to check for relocations there and react in case a relocation was not expected. 
There exists a script in scripts/ that extracts the relocations from vmlinux that is then used at postlink to check the relocations. Signed-off-by: Alexandre Ghiti Reviewed-by: Anup Patel Link: https://lore.kernel.org/r/20230329045329.64565-6-alexghiti@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/Makefile.postlink | 36 ++++++++++++++++++++++++++++++++++++ arch/riscv/tools/relocs_check.sh | 26 ++++++++++++++++++++++++++ 2 files changed, 62 insertions(+) create mode 100644 arch/riscv/Makefile.postlink create mode 100755 arch/riscv/tools/relocs_check.sh diff --git a/arch/riscv/Makefile.postlink b/arch/riscv/Makefile.postlink new file mode 100644 index 000000000000..d5de8d520d3e --- /dev/null +++ b/arch/riscv/Makefile.postlink @@ -0,0 +1,36 @@ +# SPDX-License-Identifier: GPL-2.0 +# =========================================================================== +# Post-link riscv pass +# =========================================================================== +# +# Check that vmlinux relocations look sane + +PHONY := __archpost +__archpost: + +-include include/config/auto.conf +include $(srctree)/scripts/Kbuild.include + +quiet_cmd_relocs_check = CHKREL $@ +cmd_relocs_check = \ + $(CONFIG_SHELL) $(srctree)/arch/riscv/tools/relocs_check.sh "$(OBJDUMP)" "$(NM)" "$@" + +# `@true` prevents complaint when there is nothing to be done + +vmlinux: FORCE + @true +ifdef CONFIG_RELOCATABLE + $(call if_changed,relocs_check) +endif + +%.ko: FORCE + @true + +clean: + @true + +PHONY += FORCE clean + +FORCE: + +.PHONY: $(PHONY) diff --git a/arch/riscv/tools/relocs_check.sh b/arch/riscv/tools/relocs_check.sh new file mode 100755 index 000000000000..baeb2e7b2290 --- /dev/null +++ b/arch/riscv/tools/relocs_check.sh @@ -0,0 +1,26 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0-or-later +# Based on powerpc relocs_check.sh + +# This script checks the relocations of a vmlinux for "suspicious" +# relocations. + +if [ $# -lt 3 ]; then + echo "$0 [path to objdump] [path to nm] [path to vmlinux]" 1>&2 + exit 1 +fi + +bad_relocs=$( +${srctree}/scripts/relocs_check.sh "$@" | + # These relocations are okay + # R_RISCV_RELATIVE + grep -F -w -v 'R_RISCV_RELATIVE' +) + +if [ -z "$bad_relocs" ]; then + exit 0 +fi + +num_bad=$(echo "$bad_relocs" | wc -l) +echo "WARNING: $num_bad bad relocations" +echo "$bad_relocs" -- cgit v1.2.3 From 559d1e45a16dcf1542e430ea3dce9ab625be98d0 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Wed, 29 Mar 2023 06:53:29 +0200 Subject: riscv: Use --emit-relocs in order to move .rela.dyn in init MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To circumvent an issue where placing the relocations inside the init sections produces empty relocations, use --emit-relocs. But to avoid carrying those relocations in vmlinux, use an intermediate vmlinux.relocs file which is a copy of vmlinux *before* stripping its relocations. 
Suggested-by: Björn Töpel Suggested-by: Nick Desaulniers Signed-off-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20230329045329.64565-7-alexghiti@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/Makefile | 2 +- arch/riscv/Makefile.postlink | 13 +++++++++++++ arch/riscv/boot/Makefile | 7 +++++++ 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile index 860b09e409c7..7dc6904a6836 100644 --- a/arch/riscv/Makefile +++ b/arch/riscv/Makefile @@ -8,7 +8,7 @@ OBJCOPYFLAGS := -O binary ifeq ($(CONFIG_RELOCATABLE),y) - LDFLAGS_vmlinux += -shared -Bsymbolic -z notext -z norelro + LDFLAGS_vmlinux += -shared -Bsymbolic -z notext -z norelro --emit-relocs KBUILD_CFLAGS += -fPIE endif ifeq ($(CONFIG_DYNAMIC_FTRACE),y) diff --git a/arch/riscv/Makefile.postlink b/arch/riscv/Makefile.postlink index d5de8d520d3e..a46fc578b30b 100644 --- a/arch/riscv/Makefile.postlink +++ b/arch/riscv/Makefile.postlink @@ -15,12 +15,25 @@ quiet_cmd_relocs_check = CHKREL $@ cmd_relocs_check = \ $(CONFIG_SHELL) $(srctree)/arch/riscv/tools/relocs_check.sh "$(OBJDUMP)" "$(NM)" "$@" +ifdef CONFIG_RELOCATABLE +quiet_cmd_cp_vmlinux_relocs = CPREL vmlinux.relocs +cmd_cp_vmlinux_relocs = cp vmlinux vmlinux.relocs + +quiet_cmd_relocs_strip = STRIPREL $@ +cmd_relocs_strip = $(OBJCOPY) --remove-section='.rel.*' \ + --remove-section='.rel__*' \ + --remove-section='.rela.*' \ + --remove-section='.rela__*' $@ +endif + # `@true` prevents complaint when there is nothing to be done vmlinux: FORCE @true ifdef CONFIG_RELOCATABLE $(call if_changed,relocs_check) + $(call if_changed,cp_vmlinux_relocs) + $(call if_changed,relocs_strip) endif %.ko: FORCE diff --git a/arch/riscv/boot/Makefile b/arch/riscv/boot/Makefile index c72de7232abb..22b13947bd13 100644 --- a/arch/riscv/boot/Makefile +++ b/arch/riscv/boot/Makefile @@ -33,7 +33,14 @@ $(obj)/xipImage: vmlinux FORCE endif +ifdef CONFIG_RELOCATABLE +vmlinux.relocs: vmlinux + @ (! [ -f vmlinux.relocs ] && echo "vmlinux.relocs can't be found, please remove vmlinux and try again") || true + +$(obj)/Image: vmlinux.relocs FORCE +else $(obj)/Image: vmlinux FORCE +endif $(call if_changed,objcopy) $(obj)/Image.gz: $(obj)/Image FORCE -- cgit v1.2.3 From bb3f89487fd936df7cc5165bae37ca2669056c5c Mon Sep 17 00:00:00 2001 From: Evan Green Date: Thu, 20 Apr 2023 12:49:34 -0700 Subject: RISC-V: hwprobe: Remove __init on probe_vendor_features() probe_vendor_features() is now called from smp_callin(), which is not __init code and runs during cpu hotplug events. Remove the __init_or_module decoration from it and the functions it calls to avoid walking into outer space. 
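The bug class being avoided: anything marked __init (or
__init_or_module) lives in .init.text, which free_initmem() discards
once boot completes, so a CPU hotplugged later would call into freed,
possibly reused memory. A reduced sketch of the pattern, with invented
function names rather than the ones the patch touches:

#include <linux/init.h>
#include <linux/printk.h>

/* .init.text: freed by free_initmem() once boot completes. */
static int __init boot_time_probe(void)
{
	pr_info("probing at boot\n");
	return 0;
}

/*
 * smp_callin()-style path: also runs when a CPU is hotplugged long
 * after .init.text is gone, so this call would jump into freed memory.
 * modpost flags such calls as section mismatches; dropping the __init
 * annotation, as this patch does, is the fix.
 */
void secondary_cpu_up(unsigned int cpu)
{
	boot_time_probe();
}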
Fixes: 62a31d6e38bd ("RISC-V: hwprobe: Support probing of misaligned access performance") Signed-off-by: Evan Green Reviewed-by: Conor Dooley Link: https://lore.kernel.org/r/20230420194934.1871356-1-evan@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/errata/thead/errata.c | 6 +++--- arch/riscv/include/asm/alternative.h | 2 +- arch/riscv/kernel/alternative.c | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/riscv/errata/thead/errata.c b/arch/riscv/errata/thead/errata.c index 1036b8f933ec..a86c4facc2a6 100644 --- a/arch/riscv/errata/thead/errata.c +++ b/arch/riscv/errata/thead/errata.c @@ -118,9 +118,9 @@ void __init_or_module thead_errata_patch_func(struct alt_entry *begin, struct al local_flush_icache_all(); } -void __init_or_module thead_feature_probe_func(unsigned int cpu, - unsigned long archid, - unsigned long impid) +void thead_feature_probe_func(unsigned int cpu, + unsigned long archid, + unsigned long impid) { if ((archid == 0) && (impid == 0)) per_cpu(misaligned_access_speed, cpu) = RISCV_HWPROBE_MISALIGNED_FAST; diff --git a/arch/riscv/include/asm/alternative.h b/arch/riscv/include/asm/alternative.h index a8f5cf6694a1..6a41537826a7 100644 --- a/arch/riscv/include/asm/alternative.h +++ b/arch/riscv/include/asm/alternative.h @@ -30,7 +30,7 @@ #define ALT_OLD_PTR(a) __ALT_PTR(a, old_offset) #define ALT_ALT_PTR(a) __ALT_PTR(a, alt_offset) -void __init probe_vendor_features(unsigned int cpu); +void probe_vendor_features(unsigned int cpu); void __init apply_boot_alternatives(void); void __init apply_early_boot_alternatives(void); void apply_module_alternatives(void *start, size_t length); diff --git a/arch/riscv/kernel/alternative.c b/arch/riscv/kernel/alternative.c index fc65c9293ac5..6b75788c18e6 100644 --- a/arch/riscv/kernel/alternative.c +++ b/arch/riscv/kernel/alternative.c @@ -31,7 +31,7 @@ struct cpu_manufacturer_info_t { unsigned long impid); }; -static void __init_or_module riscv_fill_cpu_mfr_info(struct cpu_manufacturer_info_t *cpu_mfr_info) +static void riscv_fill_cpu_mfr_info(struct cpu_manufacturer_info_t *cpu_mfr_info) { #ifdef CONFIG_RISCV_M_MODE cpu_mfr_info->vendor_id = csr_read(CSR_MVENDORID); @@ -144,7 +144,7 @@ void riscv_alternative_fix_offsets(void *alt_ptr, unsigned int len, } /* Called on each CPU as it starts */ -void __init_or_module probe_vendor_features(unsigned int cpu) +void probe_vendor_features(unsigned int cpu) { struct cpu_manufacturer_info_t cpu_mfr_info; -- cgit v1.2.3 From d4dda690b44a5e36b4bf108c48177cef40c80d24 Mon Sep 17 00:00:00 2001 From: Conor Dooley Date: Mon, 24 Apr 2023 18:05:43 +0100 Subject: dt-bindings: riscv: add sv57 mmu-type Dumping the dtb from new versions of QEMU warns that sv57 is an undocumented mmu-type. The kernel has supported sv57 for about a year, so bring it into the fold. 
Signed-off-by: Conor Dooley
Acked-by: Rob Herring
Link: https://lore.kernel.org/r/20230424-rival-habitual-478567c516f0@spud
Signed-off-by: Palmer Dabbelt
---
 Documentation/devicetree/bindings/riscv/cpus.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Documentation/devicetree/bindings/riscv/cpus.yaml b/Documentation/devicetree/bindings/riscv/cpus.yaml
index f24cf9601c6e..9b512c1d93fe 100644
--- a/Documentation/devicetree/bindings/riscv/cpus.yaml
+++ b/Documentation/devicetree/bindings/riscv/cpus.yaml
@@ -65,6 +65,7 @@ properties:
       - riscv,sv32
       - riscv,sv39
       - riscv,sv48
+      - riscv,sv57
       - riscv,none

   riscv,cbom-block-size:
-- cgit v1.2.3


From 26e7aacb83dfd04330673c5c9ac336560da52bb3 Mon Sep 17 00:00:00 2001
From: Alexandre Ghiti
Date: Mon, 24 Apr 2023 11:23:13 +0200
Subject: riscv: Allow downgrading the paging mode from the command line
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add two early command-line parameters that allow downgrading the satp
mode (using the same naming as x86):
- "no5lvl": use a 4-level page table (down from sv57 to sv48)
- "no4lvl": use a 3-level page table (down from sv57/sv48 to sv39)

Note that going through the device tree to get the kernel command line
works with ACPI too, since the EFI stub creates a device tree anyway
with the command line.

In KASAN kernels, we can't use libfdt that early in the boot process,
since we are not ready to execute instrumented functions. So instead of
using the "generic" libfdt, we compile our own versions of those
functions, uninstrumented and prefixed so that they do not conflict
with the generic ones. We also need the non-instrumented versions of
the string functions and the prefixed versions of memcpy/memmove.

This is largely inspired by commit aacd149b6238 ("arm64: head: avoid
relocating the kernel twice for KASLR"), from which I removed
compilation flags that are not relevant to RISC-V at the moment (LTO,
SCS).

Also note that we have to link with -z norelro to keep ld.lld from
throwing a warning about the new .got sections, as in commit
311bea3cb9ee ("arm64: link with -z norelro for LLD or aarch64-elf").

Signed-off-by: Alexandre Ghiti
Tested-by: Björn Töpel
Reviewed-by: Björn Töpel
Link: https://lore.kernel.org/r/20230424092313.178699-2-alexghiti@rivosinc.com
Signed-off-by: Palmer Dabbelt
---
 Documentation/admin-guide/kernel-parameters.txt |  5 +-
 arch/riscv/Makefile                             |  3 +-
 arch/riscv/kernel/Makefile                      |  2 +
 arch/riscv/kernel/pi/Makefile                   | 39 ++++++++++++++++
 arch/riscv/kernel/pi/cmdline_early.c            | 62 +++++++++++++++++++++++++
 arch/riscv/kernel/vmlinux.lds.S                 | 11 ++++-
 arch/riscv/lib/memcpy.S                         |  2 +
 arch/riscv/lib/memmove.S                        |  2 +
 arch/riscv/lib/strlen.S                         |  1 +
 arch/riscv/mm/init.c                            | 36 +++++++++++---
 10 files changed, 154 insertions(+), 9 deletions(-)
 create mode 100644 arch/riscv/kernel/pi/Makefile
 create mode 100644 arch/riscv/kernel/pi/cmdline_early.c

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 6221a1d057dd..accc400b43f1 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3576,7 +3576,10 @@
 			emulation library even if a 387 maths coprocessor
 			is present.

-	no5lvl		[X86-64] Disable 5-level paging mode. Forces
+	no4lvl		[RISCV] Disable 4-level and 5-level paging modes. Forces
+			kernel to use 3-level paging instead.
+
+	no5lvl		[X86-64,RISCV] Disable 5-level paging mode. Forces
 			kernel to use 4-level paging instead.
 	nofsgsbase	[X86] Disables FSGSBASE instructions.

diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile
index e859e1721a8f..d44d0fb98168 100644
--- a/arch/riscv/Makefile
+++ b/arch/riscv/Makefile
@@ -7,8 +7,9 @@
 #

 OBJCOPYFLAGS	:= -O binary
+LDFLAGS_vmlinux	:= -z norelro
 ifeq ($(CONFIG_RELOCATABLE),y)
-	LDFLAGS_vmlinux += -shared -Bsymbolic -z notext -z norelro --emit-relocs
+	LDFLAGS_vmlinux += -shared -Bsymbolic -z notext --emit-relocs
 	KBUILD_CFLAGS += -fPIE
 endif
 ifeq ($(CONFIG_DYNAMIC_FTRACE),y)
diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile
index 392fa6e35d4a..0fee73a20c87 100644
--- a/arch/riscv/kernel/Makefile
+++ b/arch/riscv/kernel/Makefile
@@ -87,3 +87,5 @@ obj-$(CONFIG_EFI)		+= efi.o
 obj-$(CONFIG_COMPAT)		+= compat_syscall_table.o
 obj-$(CONFIG_COMPAT)		+= compat_signal.o
 obj-$(CONFIG_COMPAT)		+= compat_vdso/
+
+obj-$(CONFIG_64BIT)		+= pi/
diff --git a/arch/riscv/kernel/pi/Makefile b/arch/riscv/kernel/pi/Makefile
new file mode 100644
index 000000000000..5d7cb991f2b8
--- /dev/null
+++ b/arch/riscv/kernel/pi/Makefile
@@ -0,0 +1,39 @@
+# SPDX-License-Identifier: GPL-2.0
+# This file was copied from arm64/kernel/pi/Makefile.
+
+KBUILD_CFLAGS	:= $(subst $(CC_FLAGS_FTRACE),,$(KBUILD_CFLAGS)) -fpie \
+		   -Os -DDISABLE_BRANCH_PROFILING $(DISABLE_STACKLEAK_PLUGIN) \
+		   $(call cc-option,-mbranch-protection=none) \
+		   -I$(srctree)/scripts/dtc/libfdt -fno-stack-protector \
+		   -D__DISABLE_EXPORTS -ffreestanding \
+		   -fno-asynchronous-unwind-tables -fno-unwind-tables \
+		   $(call cc-option,-fno-addrsig)
+
+KBUILD_CFLAGS	+= -mcmodel=medany
+
+CFLAGS_cmdline_early.o += -D__NO_FORTIFY
+CFLAGS_lib-fdt_ro.o += -D__NO_FORTIFY
+
+GCOV_PROFILE	:= n
+KASAN_SANITIZE	:= n
+KCSAN_SANITIZE	:= n
+UBSAN_SANITIZE	:= n
+KCOV_INSTRUMENT	:= n
+
+$(obj)/%.pi.o: OBJCOPYFLAGS := --prefix-symbols=__pi_ \
+			       --remove-section=.note.gnu.property \
+			       --prefix-alloc-sections=.init
+$(obj)/%.pi.o: $(obj)/%.o FORCE
+	$(call if_changed,objcopy)
+
+$(obj)/lib-%.o: $(srctree)/lib/%.c FORCE
+	$(call if_changed_rule,cc_o_c)
+
+$(obj)/string.o: $(srctree)/lib/string.c FORCE
+	$(call if_changed_rule,cc_o_c)
+
+$(obj)/ctype.o: $(srctree)/lib/ctype.c FORCE
+	$(call if_changed_rule,cc_o_c)
+
+obj-y		:= cmdline_early.pi.o string.pi.o ctype.pi.o lib-fdt.pi.o lib-fdt_ro.pi.o
+extra-y		:= $(patsubst %.pi.o,%.o,$(obj-y))
diff --git a/arch/riscv/kernel/pi/cmdline_early.c b/arch/riscv/kernel/pi/cmdline_early.c
new file mode 100644
index 000000000000..05652d13c746
--- /dev/null
+++ b/arch/riscv/kernel/pi/cmdline_early.c
@@ -0,0 +1,62 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/libfdt.h>
+#include <linux/string.h>
+#include <asm/pgtable.h>
+#include <asm/setup.h>
+
+static char early_cmdline[COMMAND_LINE_SIZE];
+
+/*
+ * Declare the functions that are exported (but prefixed) here so that LLVM
+ * does not complain it lacks the 'static' keyword (which, if added, makes
+ * LLVM complain because the function is actually unused in this file).
+ */ +u64 set_satp_mode_from_cmdline(uintptr_t dtb_pa); + +static char *get_early_cmdline(uintptr_t dtb_pa) +{ + const char *fdt_cmdline = NULL; + unsigned int fdt_cmdline_size = 0; + int chosen_node; + + if (!IS_ENABLED(CONFIG_CMDLINE_FORCE)) { + chosen_node = fdt_path_offset((void *)dtb_pa, "/chosen"); + if (chosen_node >= 0) { + fdt_cmdline = fdt_getprop((void *)dtb_pa, chosen_node, + "bootargs", NULL); + if (fdt_cmdline) { + fdt_cmdline_size = strlen(fdt_cmdline); + strscpy(early_cmdline, fdt_cmdline, + COMMAND_LINE_SIZE); + } + } + } + + if (IS_ENABLED(CONFIG_CMDLINE_EXTEND) || + IS_ENABLED(CONFIG_CMDLINE_FORCE) || + fdt_cmdline_size == 0 /* CONFIG_CMDLINE_FALLBACK */) { + strncat(early_cmdline, CONFIG_CMDLINE, + COMMAND_LINE_SIZE - fdt_cmdline_size); + } + + return early_cmdline; +} + +static u64 match_noXlvl(char *cmdline) +{ + if (strstr(cmdline, "no4lvl")) + return SATP_MODE_48; + else if (strstr(cmdline, "no5lvl")) + return SATP_MODE_57; + + return 0; +} + +u64 set_satp_mode_from_cmdline(uintptr_t dtb_pa) +{ + char *cmdline = get_early_cmdline(dtb_pa); + + return match_noXlvl(cmdline); +} diff --git a/arch/riscv/kernel/vmlinux.lds.S b/arch/riscv/kernel/vmlinux.lds.S index 615ff5842690..305877d85e96 100644 --- a/arch/riscv/kernel/vmlinux.lds.S +++ b/arch/riscv/kernel/vmlinux.lds.S @@ -83,6 +83,14 @@ SECTIONS /* Start of init data section */ __init_data_begin = .; INIT_DATA_SECTION(16) + + /* Those sections result from the compilation of kernel/pi/string.c */ + .init.pidata : { + *(.init.srodata.cst8*) + *(.init__bug_table*) + *(.init.sdata*) + } + .init.bss : { *(.init.bss) /* from the EFI stub */ } @@ -128,9 +136,10 @@ SECTIONS __rela_dyn_end = .; } + .got : { *(.got*) } + #ifdef CONFIG_RELOCATABLE .data.rel : { *(.data.rel*) } - .got : { *(.got*) } .plt : { *(.plt) } .dynamic : { *(.dynamic) } .dynsym : { *(.dynsym) } diff --git a/arch/riscv/lib/memcpy.S b/arch/riscv/lib/memcpy.S index 51ab716253fa..1a40d01a9543 100644 --- a/arch/riscv/lib/memcpy.S +++ b/arch/riscv/lib/memcpy.S @@ -106,3 +106,5 @@ WEAK(memcpy) 6: ret END(__memcpy) +SYM_FUNC_ALIAS(__pi_memcpy, __memcpy) +SYM_FUNC_ALIAS(__pi___memcpy, __memcpy) diff --git a/arch/riscv/lib/memmove.S b/arch/riscv/lib/memmove.S index e0609e1f0864..838ff2022fe3 100644 --- a/arch/riscv/lib/memmove.S +++ b/arch/riscv/lib/memmove.S @@ -314,3 +314,5 @@ return_from_memmove: SYM_FUNC_END(memmove) SYM_FUNC_END(__memmove) +SYM_FUNC_ALIAS(__pi_memmove, __memmove) +SYM_FUNC_ALIAS(__pi___memmove, __memmove) diff --git a/arch/riscv/lib/strlen.S b/arch/riscv/lib/strlen.S index db3d42d99b78..8ae3064e45ff 100644 --- a/arch/riscv/lib/strlen.S +++ b/arch/riscv/lib/strlen.S @@ -130,3 +130,4 @@ strlen_zbb: .option pop #endif SYM_FUNC_END(strlen) +SYM_FUNC_ALIAS(__pi_strlen, strlen) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index bce899b180cd..3ad771571c2d 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -746,6 +746,8 @@ static __init pgprot_t pgprot_from_va(uintptr_t va) #endif /* CONFIG_STRICT_KERNEL_RWX */ #if defined(CONFIG_64BIT) && !defined(CONFIG_XIP_KERNEL) +u64 __pi_set_satp_mode_from_cmdline(uintptr_t dtb_pa); + static void __init disable_pgtable_l5(void) { pgtable_l5_enabled = false; @@ -760,17 +762,39 @@ static void __init disable_pgtable_l4(void) satp_mode = SATP_MODE_39; } +static int __init print_no4lvl(char *p) +{ + pr_info("Disabled 4-level and 5-level paging"); + return 0; +} +early_param("no4lvl", print_no4lvl); + +static int __init print_no5lvl(char *p) +{ + pr_info("Disabled 5-level paging"); + return 0; 
+}
+early_param("no5lvl", print_no5lvl);
+
 /*
  * There is a simple way to determine if 4-level is supported by the
  * underlying hardware: establish 1:1 mapping in 4-level page table mode
  * then read SATP to see if the configuration was taken into account
  * meaning sv48 is supported.
  */
-static __init void set_satp_mode(void)
+static __init void set_satp_mode(uintptr_t dtb_pa)
 {
 	u64 identity_satp, hw_satp;
 	uintptr_t set_satp_mode_pmd = ((unsigned long)set_satp_mode) & PMD_MASK;
-	bool check_l4 = false;
+	u64 satp_mode_cmdline = __pi_set_satp_mode_from_cmdline(dtb_pa);
+
+	if (satp_mode_cmdline == SATP_MODE_57) {
+		disable_pgtable_l5();
+	} else if (satp_mode_cmdline == SATP_MODE_48) {
+		disable_pgtable_l5();
+		disable_pgtable_l4();
+		return;
+	}

 	create_p4d_mapping(early_p4d,
 			   set_satp_mode_pmd, (uintptr_t)early_pud,
@@ -789,7 +813,8 @@ static __init void set_satp_mode(void)
 retry:
 	create_pgd_mapping(early_pg_dir,
 			   set_satp_mode_pmd,
-			   check_l4 ? (uintptr_t)early_pud : (uintptr_t)early_p4d,
+			   pgtable_l5_enabled ?
+				(uintptr_t)early_p4d : (uintptr_t)early_pud,
 			   PGDIR_SIZE,
 			   PAGE_TABLE);

 	identity_satp = PFN_DOWN((uintptr_t)&early_pg_dir) | satp_mode;
@@ -800,9 +825,8 @@ retry:
 	local_flush_tlb_all();

 	if (hw_satp != identity_satp) {
-		if (!check_l4) {
+		if (pgtable_l5_enabled) {
 			disable_pgtable_l5();
-			check_l4 = true;
 			memset(early_pg_dir, 0, PAGE_SIZE);
 			goto retry;
 		}
@@ -1031,7 +1055,7 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
 #endif

 #if defined(CONFIG_64BIT) && !defined(CONFIG_XIP_KERNEL)
-	set_satp_mode();
+	set_satp_mode(dtb_pa);
 #endif

 	/*
-- cgit v1.2.3


From 08dc107594681040587c23a097cfa678e51f5af2 Mon Sep 17 00:00:00 2001
From: Andrew Jones
Date: Wed, 26 Apr 2023 16:13:32 +0200
Subject: RISC-V: hwprobe: There can only be one first

Only capture the first cpu_id in order for the comparison below to be
of any use.

Fixes: ea3de9ce8aa2 ("RISC-V: Add a syscall for HW probing")
Signed-off-by: Andrew Jones
Reviewed-by: Conor Dooley
Reviewed-by: Evan Green
Link: https://lore.kernel.org/r/20230426141333.10063-2-ajones@ventanamicro.com
Signed-off-by: Palmer Dabbelt
---
 arch/riscv/kernel/sys_riscv.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/riscv/kernel/sys_riscv.c b/arch/riscv/kernel/sys_riscv.c
index 849b4170629d..c569dac7452e 100644
--- a/arch/riscv/kernel/sys_riscv.c
+++ b/arch/riscv/kernel/sys_riscv.c
@@ -103,8 +103,10 @@ static void hwprobe_arch_id(struct riscv_hwprobe *pair,
 			break;
 		}

-		if (first)
+		if (first) {
 			id = cpu_id;
+			first = false;
+		}

 		/*
 		 * If there's a mismatch for the given set, return -1 in the
-- cgit v1.2.3


From b09313dd2e726fe5e1fa574cd73f5e326c6030a4 Mon Sep 17 00:00:00 2001
From: Andrew Jones
Date: Wed, 26 Apr 2023 16:13:33 +0200
Subject: RISC-V: hwprobe: Explicitly check for -1 in vdso init

id_bitsmash is unsigned. We need to explicitly check for -1, rather
than use > 0.
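The subtlety: id_bitsmash is a u64, and the "CPUs disagree" sentinel
stored into it is (u64)-1, the largest possible value, so the old
`id_bitsmash > 0` test reported a homogeneous system in exactly the
heterogeneous case it was meant to reject. A user-space sketch of the
before/after predicate, with types assumed to mirror the kernel's:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* -1 is what hwprobe_arch_id() stores when CPUs disagree. */
	uint64_t id_bitsmash = (uint64_t)-1;

	/* Old test: (u64)-1 > 0 is true, wrongly claiming homogeneity. */
	int old_homogeneous = id_bitsmash > 0;

	/* Fixed test: homogeneous only if set and not the -1 sentinel. */
	int new_homogeneous = id_bitsmash != 0 && id_bitsmash != (uint64_t)-1;

	printf("old=%d new=%d\n", old_homogeneous, new_homogeneous);
	return 0;
}

This prints "old=1 new=0" on a heterogeneous system, matching the fix
below.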
Fixes: aa5af0aa90ba ("RISC-V: Add hwprobe vDSO function and data") Signed-off-by: Andrew Jones Reviewed-by: Conor Dooley Reviewed-by: Evan Green Link: https://lore.kernel.org/r/20230426141333.10063-3-ajones@ventanamicro.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/sys_riscv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/kernel/sys_riscv.c b/arch/riscv/kernel/sys_riscv.c index c569dac7452e..5db29683ebee 100644 --- a/arch/riscv/kernel/sys_riscv.c +++ b/arch/riscv/kernel/sys_riscv.c @@ -282,7 +282,7 @@ static int __init init_hwprobe_vdso_data(void) * populated) or any value returns -1 (varies across CPUs), then the * vDSO should defer to the kernel for exotic cpu masks. */ - avd->homogeneous_cpus = (id_bitsmash > 0); + avd->homogeneous_cpus = id_bitsmash != 0 && id_bitsmash != -1; return 0; } -- cgit v1.2.3
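For completeness, this is roughly what a user-space consumer of the
interface these two fixes harden looks like. The syscall number (258)
and key values below match the v6.4-era uapi headers but are
assumptions here; a real program should include <asm/hwprobe.h> and
<asm/unistd.h> rather than redefining them:

#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef __NR_riscv_hwprobe
#define __NR_riscv_hwprobe 258		/* assumption: v6.4 uapi value */
#endif

struct riscv_hwprobe {
	long long key;
	unsigned long long value;
};

#define RISCV_HWPROBE_KEY_MVENDORID	0
#define RISCV_HWPROBE_KEY_MARCHID	1

int main(void)
{
	struct riscv_hwprobe pairs[] = {
		{ .key = RISCV_HWPROBE_KEY_MVENDORID },
		{ .key = RISCV_HWPROBE_KEY_MARCHID },
	};

	/* cpusetsize == 0 && cpus == NULL: query all online CPUs. */
	if (syscall(__NR_riscv_hwprobe, pairs, 2, 0, NULL, 0))
		return 1;

	printf("mvendorid=%llx marchid=%llx\n",
	       pairs[0].value, pairs[1].value);
	return 0;
}

If any queried CPU disagrees on a key, the kernel reports -1 for that
value, which is precisely the sentinel the vdso-init fix above now
handles correctly.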