path: root/arch/s390/mm
author    Alexander Gordeev <agordeev@linux.ibm.com>    2022-12-13 13:35:11 +0300
committer Heiko Carstens <hca@linux.ibm.com>            2023-01-13 16:15:05 +0300
commit    bb1520d581a3a46e2d6e12bb74604ace33404de5 (patch)
tree      e30707db7e1375a9b4bb8bd40102e57aee8c968a /arch/s390/mm
parent    bd50b7436217b4123911c2bca1efd74718654f06 (diff)
download  linux-bb1520d581a3a46e2d6e12bb74604ace33404de5.tar.xz
s390/mm: start kernel with DAT enabled
The setup of the kernel virtual address space is spread throughout the sources, boot stages and config options like this:

1. The available physical memory regions are queried and stored as mem_detect information for later use in the decompressor.

2. Based on the physical memory availability the virtual memory layout is established in the decompressor.

3. If CONFIG_KASAN is disabled, the kernel paging setup code populates kernel pgtables and turns DAT mode on. It uses the information stored at step [1].

4. If CONFIG_KASAN is enabled, the kernel early boot kasan setup populates kernel pgtables and turns DAT mode on. It uses the information stored at step [1]. The kasan setup creates the early_pg_dir directory and directly overwrites swapper_pg_dir entries to make shadow memory pages available.

Move the kernel virtual memory setup to the decompressor and start the kernel with DAT turned on right from the very first instruction. That completely eliminates the boot phase in which the kernel runs in DAT-off mode, simplifies the overall design and consolidates the pgtables setup.

The identity mapping is created in the decompressor, while kasan shadow mappings are still created by the early boot kernel code.

The existing kasan memory allocator is shared with the decompressor. It decreases the size of a newly requested memory block from pgalloc_pos and ensures that the kernel image is not overwritten. The pgalloc_low and pgalloc_pos pointers are made preserved boot variables for that.

Use the bootdata infrastructure to set up the swapper_pg_dir and invalid_pg_dir directories used by the kernel later. The interim early_pg_dir directory established by the kasan initialization code gets eliminated as a result.

As the kernel runs in DAT-on mode only, the PSW_KERNEL_BITS define gets the PSW_MASK_DAT bit by default. Additionally, the setup_lowcore_dat_off() and setup_lowcore_dat_on() routines get merged, since there is no DAT-off mode stage anymore.

The memory mappings are created with RW+X protection, which allows the early boot code to set up all necessary data and services for the kernel being booted. Just before paging is enabled, the memory protection is changed to RO+X for text, RO+NX for read-only data and RW+NX for kernel data and the identity mapping.

Reviewed-by: Heiko Carstens <hca@linux.ibm.com>
Signed-off-by: Alexander Gordeev <agordeev@linux.ibm.com>
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
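For context, the __bootdata_preserved() annotation that the init.c hunk below switches to is the mechanism by which the decompressor hands values such as s390_invalid_asce, the pgtable directories and the pgalloc_low/pgalloc_pos pointers over to the decompressed kernel. A minimal sketch of the pattern, assuming the usual definition from arch/s390/include/asm/setup.h (shown approximately, not copied from this patch):

    /* Place the variable into a per-variable boot data section; the
     * decompressor writes it while building the page tables, and the
     * decompressed kernel reads it back unchanged.
     */
    #define __bootdata_preserved(var) __section(".boot.preserved.data." #var) var

    /* Kernel side (see the init.c hunk below): the declaration now resolves
     * to the value the decompressor stored, so no runtime setup is needed.
     */
    unsigned long __bootdata_preserved(s390_invalid_asce);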
Diffstat (limited to 'arch/s390/mm')
-rw-r--r--  arch/s390/mm/init.c        35
-rw-r--r--  arch/s390/mm/kasan_init.c  85
-rw-r--r--  arch/s390/mm/vmem.c        96
3 files changed, 96 insertions, 120 deletions
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index 30ab55f868f6..144447d5cb4c 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -52,9 +52,9 @@
#include <linux/virtio_config.h>
pgd_t swapper_pg_dir[PTRS_PER_PGD] __section(".bss..swapper_pg_dir");
-static pgd_t invalid_pg_dir[PTRS_PER_PGD] __section(".bss..invalid_pg_dir");
+pgd_t invalid_pg_dir[PTRS_PER_PGD] __section(".bss..invalid_pg_dir");
-unsigned long s390_invalid_asce;
+unsigned long __bootdata_preserved(s390_invalid_asce);
unsigned long empty_zero_page, zero_page_mask;
EXPORT_SYMBOL(empty_zero_page);
@@ -93,37 +93,8 @@ static void __init setup_zero_pages(void)
void __init paging_init(void)
{
unsigned long max_zone_pfns[MAX_NR_ZONES];
- unsigned long pgd_type, asce_bits;
- psw_t psw;
-
- s390_invalid_asce = (unsigned long)invalid_pg_dir;
- s390_invalid_asce |= _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH;
- crst_table_init((unsigned long *)invalid_pg_dir, _REGION3_ENTRY_EMPTY);
- init_mm.pgd = swapper_pg_dir;
- if (VMALLOC_END > _REGION2_SIZE) {
- asce_bits = _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH;
- pgd_type = _REGION2_ENTRY_EMPTY;
- } else {
- asce_bits = _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH;
- pgd_type = _REGION3_ENTRY_EMPTY;
- }
- init_mm.context.asce = (__pa(init_mm.pgd) & PAGE_MASK) | asce_bits;
- S390_lowcore.kernel_asce = init_mm.context.asce;
- S390_lowcore.user_asce = s390_invalid_asce;
- crst_table_init((unsigned long *) init_mm.pgd, pgd_type);
- vmem_map_init();
- kasan_copy_shadow_mapping();
-
- /* enable virtual mapping in kernel mode */
- __ctl_load(S390_lowcore.kernel_asce, 1, 1);
- __ctl_load(S390_lowcore.user_asce, 7, 7);
- __ctl_load(S390_lowcore.kernel_asce, 13, 13);
- psw.mask = __extract_psw();
- psw_bits(psw).dat = 1;
- psw_bits(psw).as = PSW_BITS_AS_HOME;
- __load_psw_mask(psw.mask);
- kasan_free_early_identity();
+ vmem_map_init();
sparse_init();
zone_dma_bits = 31;
memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
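Pieced together from the context lines of the hunk above, paging_init() no longer touches ASCEs or the PSW after this change and reduces to roughly the following (a sketch; the zone setup tail is elided and marked as such):

    void __init paging_init(void)
    {
    	unsigned long max_zone_pfns[MAX_NR_ZONES];

    	vmem_map_init();
    	sparse_init();
    	zone_dma_bits = 31;
    	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
    	/* ... zone PFN limits and free_area_init() follow, unchanged ... */
    }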
diff --git a/arch/s390/mm/kasan_init.c b/arch/s390/mm/kasan_init.c
index a97b7981358e..801d81c189a7 100644
--- a/arch/s390/mm/kasan_init.c
+++ b/arch/s390/mm/kasan_init.c
@@ -1,7 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/kasan.h>
#include <linux/sched/task.h>
-#include <linux/memblock.h>
#include <linux/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/kasan.h>
@@ -15,16 +14,11 @@
static unsigned long segment_pos __initdata;
static unsigned long segment_low __initdata;
-static unsigned long pgalloc_pos __initdata;
-static unsigned long pgalloc_low __initdata;
-static unsigned long pgalloc_freeable __initdata;
static bool has_edat __initdata;
static bool has_nx __initdata;
#define __sha(x) ((unsigned long)kasan_mem_to_shadow((void *)x))
-static pgd_t early_pg_dir[PTRS_PER_PGD] __initdata __aligned(PAGE_SIZE);
-
static void __init kasan_early_panic(const char *reason)
{
sclp_early_printk("The Linux kernel failed to boot with the KernelAddressSanitizer:\n");
@@ -229,29 +223,6 @@ static void __init kasan_early_pgtable_populate(unsigned long address,
}
}
-static void __init kasan_set_pgd(pgd_t *pgd, unsigned long asce_type)
-{
- unsigned long asce_bits;
-
- asce_bits = asce_type | _ASCE_TABLE_LENGTH;
- S390_lowcore.kernel_asce = (__pa(pgd) & PAGE_MASK) | asce_bits;
- S390_lowcore.user_asce = S390_lowcore.kernel_asce;
-
- __ctl_load(S390_lowcore.kernel_asce, 1, 1);
- __ctl_load(S390_lowcore.kernel_asce, 7, 7);
- __ctl_load(S390_lowcore.kernel_asce, 13, 13);
-}
-
-static void __init kasan_enable_dat(void)
-{
- psw_t psw;
-
- psw.mask = __extract_psw();
- psw_bits(psw).dat = 1;
- psw_bits(psw).as = PSW_BITS_AS_HOME;
- __load_psw_mask(psw.mask);
-}
-
static void __init kasan_early_detect_facilities(void)
{
if (test_facility(8)) {
@@ -272,7 +243,6 @@ void __init kasan_early_init(void)
p4d_t p4d_z = __p4d(__pa(kasan_early_shadow_pud) | _REGION2_ENTRY);
unsigned long untracked_end = MODULES_VADDR;
unsigned long shadow_alloc_size;
- unsigned long initrd_end;
unsigned long memsize;
kasan_early_detect_facilities();
@@ -298,36 +268,24 @@ void __init kasan_early_init(void)
BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_START, P4D_SIZE));
BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_END, P4D_SIZE));
- crst_table_init((unsigned long *)early_pg_dir, _REGION2_ENTRY_EMPTY);
/* init kasan zero shadow */
- crst_table_init((unsigned long *)kasan_early_shadow_p4d,
- p4d_val(p4d_z));
- crst_table_init((unsigned long *)kasan_early_shadow_pud,
- pud_val(pud_z));
- crst_table_init((unsigned long *)kasan_early_shadow_pmd,
- pmd_val(pmd_z));
+ crst_table_init((unsigned long *)kasan_early_shadow_p4d, p4d_val(p4d_z));
+ crst_table_init((unsigned long *)kasan_early_shadow_pud, pud_val(pud_z));
+ crst_table_init((unsigned long *)kasan_early_shadow_pmd, pmd_val(pmd_z));
memset64((u64 *)kasan_early_shadow_pte, pte_val(pte_z), PTRS_PER_PTE);
shadow_alloc_size = memsize >> KASAN_SHADOW_SCALE_SHIFT;
- pgalloc_low = round_up((unsigned long)_end, _SEGMENT_SIZE);
- if (IS_ENABLED(CONFIG_BLK_DEV_INITRD)) {
- initrd_end =
- round_up(initrd_data.start + initrd_data.size, _SEGMENT_SIZE);
- pgalloc_low = max(pgalloc_low, initrd_end);
- }
if (pgalloc_low + shadow_alloc_size > memsize)
kasan_early_panic("out of memory during initialisation\n");
if (has_edat) {
- segment_pos = round_down(memsize, _SEGMENT_SIZE);
+ segment_pos = round_down(pgalloc_pos, _SEGMENT_SIZE);
segment_low = segment_pos - shadow_alloc_size;
+ segment_low = round_down(segment_low, _SEGMENT_SIZE);
pgalloc_pos = segment_low;
- } else {
- pgalloc_pos = memsize;
}
- init_mm.pgd = early_pg_dir;
/*
* Current memory layout:
* +- 0 -------------+ +- shadow start -+
@@ -376,40 +334,7 @@ void __init kasan_early_init(void)
POPULATE_ZERO_SHADOW);
kasan_early_pgtable_populate(__sha(MODULES_END), __sha(_REGION1_SIZE),
POPULATE_ZERO_SHADOW);
- /* memory allocated for identity mapping structs will be freed later */
- pgalloc_freeable = pgalloc_pos;
- /* populate identity mapping */
- kasan_early_pgtable_populate(0, memsize, POPULATE_ONE2ONE);
- kasan_set_pgd(early_pg_dir, _ASCE_TYPE_REGION2);
- kasan_enable_dat();
/* enable kasan */
init_task.kasan_depth = 0;
- memblock_reserve(pgalloc_pos, memsize - pgalloc_pos);
sclp_early_printk("KernelAddressSanitizer initialized\n");
}
-
-void __init kasan_copy_shadow_mapping(void)
-{
- /*
- * At this point we are still running on early pages setup early_pg_dir,
- * while swapper_pg_dir has just been initialized with identity mapping.
- * Carry over shadow memory region from early_pg_dir to swapper_pg_dir.
- */
-
- pgd_t *pg_dir_src;
- pgd_t *pg_dir_dst;
- p4d_t *p4_dir_src;
- p4d_t *p4_dir_dst;
-
- pg_dir_src = pgd_offset_raw(early_pg_dir, KASAN_SHADOW_START);
- pg_dir_dst = pgd_offset_raw(init_mm.pgd, KASAN_SHADOW_START);
- p4_dir_src = p4d_offset(pg_dir_src, KASAN_SHADOW_START);
- p4_dir_dst = p4d_offset(pg_dir_dst, KASAN_SHADOW_START);
- memcpy(p4_dir_dst, p4_dir_src,
- (KASAN_SHADOW_SIZE >> P4D_SHIFT) * sizeof(p4d_t));
-}
-
-void __init kasan_free_early_identity(void)
-{
- memblock_phys_free(pgalloc_pos, pgalloc_freeable - pgalloc_pos);
-}
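The early_pg_dir and identity-mapping code removed above is replaced by the allocator now shared with the decompressor: new blocks are carved off the top, with pgalloc_pos moving down toward pgalloc_low. A minimal sketch of that pattern; boot_alloc() is a hypothetical helper for illustration, only pgalloc_pos, pgalloc_low and the panic message come from the patch:

    /* Hypothetical helper illustrating the top-down boot allocator:
     * pgalloc_pos shrinks toward pgalloc_low, so allocations can never
     * overwrite the kernel image (or initrd) sitting below pgalloc_low.
     */
    static void *__init boot_alloc(unsigned long size)
    {
    	size = round_up(size, PAGE_SIZE);
    	if (pgalloc_pos - size < pgalloc_low)
    		kasan_early_panic("out of memory during initialisation\n");
    	pgalloc_pos -= size;
    	return (void *)pgalloc_pos;
    }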
diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c
index ee1a97078527..78d7768f93d7 100644
--- a/arch/s390/mm/vmem.c
+++ b/arch/s390/mm/vmem.c
@@ -11,6 +11,7 @@
#include <linux/list.h>
#include <linux/hugetlb.h>
#include <linux/slab.h>
+#include <linux/sort.h>
#include <asm/cacheflush.h>
#include <asm/nospec-branch.h>
#include <asm/pgalloc.h>
@@ -657,6 +658,29 @@ void vmem_unmap_4k_page(unsigned long addr)
mutex_unlock(&vmem_mutex);
}
+static int __init memblock_region_cmp(const void *a, const void *b)
+{
+ const struct memblock_region *r1 = a;
+ const struct memblock_region *r2 = b;
+
+ if (r1->base < r2->base)
+ return -1;
+ if (r1->base > r2->base)
+ return 1;
+ return 0;
+}
+
+static void __init memblock_region_swap(void *a, void *b, int size)
+{
+ struct memblock_region *r1 = a;
+ struct memblock_region *r2 = b;
+ struct memblock_region swap;
+
+ swap = *r1;
+ *r1 = *r2;
+ *r2 = swap;
+}
+
/*
* map whole physical memory to virtual memory (identity mapping)
* we reserve enough space in the vmalloc area for vmemmap to hotplug
@@ -664,11 +688,68 @@ void vmem_unmap_4k_page(unsigned long addr)
*/
void __init vmem_map_init(void)
{
+ struct memblock_region memory_rwx_regions[] = {
+ {
+ .base = 0,
+ .size = sizeof(struct lowcore),
+ .flags = MEMBLOCK_NONE,
+#ifdef CONFIG_NUMA
+ .nid = NUMA_NO_NODE,
+#endif
+ },
+ {
+ .base = __pa(_stext),
+ .size = _etext - _stext,
+ .flags = MEMBLOCK_NONE,
+#ifdef CONFIG_NUMA
+ .nid = NUMA_NO_NODE,
+#endif
+ },
+ {
+ .base = __pa(_sinittext),
+ .size = _einittext - _sinittext,
+ .flags = MEMBLOCK_NONE,
+#ifdef CONFIG_NUMA
+ .nid = NUMA_NO_NODE,
+#endif
+ },
+ {
+ .base = __stext_amode31,
+ .size = __etext_amode31 - __stext_amode31,
+ .flags = MEMBLOCK_NONE,
+#ifdef CONFIG_NUMA
+ .nid = NUMA_NO_NODE,
+#endif
+ },
+ };
+ struct memblock_type memory_rwx = {
+ .regions = memory_rwx_regions,
+ .cnt = ARRAY_SIZE(memory_rwx_regions),
+ .max = ARRAY_SIZE(memory_rwx_regions),
+ };
phys_addr_t base, end;
u64 i;
- for_each_mem_range(i, &base, &end)
- vmem_add_range(base, end - base);
+ /*
+ * Set RW+NX attribute on all memory, except regions enumerated with
+ * memory_rwx exclude type. These regions need different attributes,
+ * which are enforced afterwards.
+ *
+ * __for_each_mem_range() iterate and exclude types should be sorted.
+ * The relative location of _stext and _sinittext is hardcoded in the
+ * linker script. However a location of __stext_amode31 and the kernel
+ * image itself are chosen dynamically. Thus, sort the exclude type.
+ */
+ sort(&memory_rwx_regions,
+ ARRAY_SIZE(memory_rwx_regions), sizeof(memory_rwx_regions[0]),
+ memblock_region_cmp, memblock_region_swap);
+ __for_each_mem_range(i, &memblock.memory, &memory_rwx,
+ NUMA_NO_NODE, MEMBLOCK_NONE, &base, &end, NULL) {
+ __set_memory((unsigned long)__va(base),
+ (end - base) >> PAGE_SHIFT,
+ SET_MEMORY_RW | SET_MEMORY_NX);
+ }
+
__set_memory((unsigned long)_stext,
(unsigned long)(_etext - _stext) >> PAGE_SHIFT,
SET_MEMORY_RO | SET_MEMORY_X);
@@ -678,15 +759,14 @@ void __init vmem_map_init(void)
__set_memory((unsigned long)_sinittext,
(unsigned long)(_einittext - _sinittext) >> PAGE_SHIFT,
SET_MEMORY_RO | SET_MEMORY_X);
- __set_memory(__stext_amode31, (__etext_amode31 - __stext_amode31) >> PAGE_SHIFT,
+ __set_memory(__stext_amode31,
+ (__etext_amode31 - __stext_amode31) >> PAGE_SHIFT,
SET_MEMORY_RO | SET_MEMORY_X);
- /* lowcore requires 4k mapping for real addresses / prefixing */
- set_memory_4k(0, LC_PAGES);
-
/* lowcore must be executable for LPSWE */
- if (!static_key_enabled(&cpu_has_bear))
- set_memory_x(0, 1);
+ if (static_key_enabled(&cpu_has_bear))
+ set_memory_nx(0, 1);
+ set_memory_nx(PAGE_SIZE, 1);
pr_info("Write protected kernel read-only data: %luk\n",
(unsigned long)(__end_rodata - _stext) >> 10);
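The exclude-type walk added in vmem_map_init() relies on __for_each_mem_range() visiting only the parts of the first memblock type that are not covered by the second, which is why the exclude array has to be sorted by base address first. Condensed into a stand-alone form (set_default_noexec() is an illustrative name, not part of the patch):

    /* Illustration: mark every page of memory not listed in 'excl' as RW+NX;
     * the text/rodata/amode31 ranges in 'excl' keep their own protections,
     * applied by the explicit __set_memory() calls afterwards.
     */
    static void __init set_default_noexec(struct memblock_type *excl)
    {
    	phys_addr_t base, end;
    	u64 i;

    	__for_each_mem_range(i, &memblock.memory, excl, NUMA_NO_NODE,
    			     MEMBLOCK_NONE, &base, &end, NULL) {
    		__set_memory((unsigned long)__va(base),
    			     (end - base) >> PAGE_SHIFT,
    			     SET_MEMORY_RW | SET_MEMORY_NX);
    	}
    }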