summaryrefslogtreecommitdiff
path: root/arch/s390
diff options
context:
space:
mode:
Diffstat (limited to 'arch/s390')
-rw-r--r--arch/s390/Kconfig65
-rw-r--r--arch/s390/Makefile15
-rw-r--r--arch/s390/boot/Makefile9
-rw-r--r--arch/s390/boot/boot.h14
-rw-r--r--arch/s390/boot/decompressor.c15
-rw-r--r--arch/s390/boot/decompressor.h8
-rw-r--r--arch/s390/boot/kaslr.c2
-rw-r--r--arch/s390/boot/pgm_check_info.c4
-rw-r--r--arch/s390/boot/startup.c263
-rw-r--r--arch/s390/boot/vmem.c108
-rw-r--r--arch/s390/boot/vmlinux.lds.S28
-rw-r--r--arch/s390/include/asm/ap.h30
-rw-r--r--arch/s390/include/asm/asm-prototypes.h1
-rw-r--r--arch/s390/include/asm/chsc.h15
-rw-r--r--arch/s390/include/asm/extmem.h7
-rw-r--r--arch/s390/include/asm/ftrace.h8
-rw-r--r--arch/s390/include/asm/gmap.h2
-rw-r--r--arch/s390/include/asm/mmu.h5
-rw-r--r--arch/s390/include/asm/mmu_context.h1
-rw-r--r--arch/s390/include/asm/nospec-branch.h20
-rw-r--r--arch/s390/include/asm/nospec-insn.h13
-rw-r--r--arch/s390/include/asm/os_info.h29
-rw-r--r--arch/s390/include/asm/page.h50
-rw-r--r--arch/s390/include/asm/pgtable.h22
-rw-r--r--arch/s390/include/asm/physmem_info.h4
-rw-r--r--arch/s390/include/asm/setup.h14
-rw-r--r--arch/s390/kernel/Makefile2
-rw-r--r--arch/s390/kernel/crash_dump.c41
-rw-r--r--arch/s390/kernel/ipl.c6
-rw-r--r--arch/s390/kernel/nospec-branch.c4
-rw-r--r--arch/s390/kernel/os_info.c29
-rw-r--r--arch/s390/kernel/perf_cpum_cf.c2
-rw-r--r--arch/s390/kernel/perf_cpum_cf_events.c11
-rw-r--r--arch/s390/kernel/setup.c6
-rw-r--r--arch/s390/kernel/stacktrace.c19
-rw-r--r--arch/s390/kernel/uv.c51
-rw-r--r--arch/s390/kernel/vmcore_info.c2
-rw-r--r--arch/s390/kernel/vmlinux.lds.S38
-rw-r--r--arch/s390/kvm/kvm-s390.c4
-rw-r--r--arch/s390/kvm/vsie.c5
-rw-r--r--arch/s390/lib/Makefile2
-rw-r--r--arch/s390/lib/expoline.S (renamed from arch/s390/lib/expoline/expoline.S)0
-rw-r--r--arch/s390/lib/expoline/Makefile3
-rw-r--r--arch/s390/mm/gmap.c165
-rw-r--r--arch/s390/mm/vmem.c5
-rw-r--r--arch/s390/pci/pci_sysfs.c4
-rw-r--r--arch/s390/tools/relocs.c2
47 files changed, 698 insertions, 455 deletions
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 8f01ada6845e..7e7fe89c9b25 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -17,6 +17,9 @@ config ARCH_HAS_ILOG2_U32
config ARCH_HAS_ILOG2_U64
def_bool n
+config ARCH_PROC_KCORE_TEXT
+ def_bool y
+
config GENERIC_HWEIGHT
def_bool y
@@ -552,7 +555,7 @@ config EXPOLINE
If unsure, say N.
config EXPOLINE_EXTERN
- def_bool n
+ def_bool y if EXPOLINE
depends on EXPOLINE
depends on CC_IS_GCC && GCC_VERSION >= 110200
depends on $(success,$(srctree)/arch/s390/tools/gcc-thunk-extern.sh $(CC))
@@ -590,18 +593,6 @@ config RELOCATABLE
Note: this option exists only for documentation purposes, please do
not remove it.
-config PIE_BUILD
- def_bool CC_IS_CLANG && !$(cc-option,-munaligned-symbols)
- help
- If the compiler is unable to generate code that can manage unaligned
- symbols, the kernel is linked as a position-independent executable
- (PIE) and includes dynamic relocations that are processed early
- during bootup.
-
- For kpatch functionality, it is recommended to build the kernel
- without the PIE_BUILD option. PIE_BUILD is only enabled when the
- compiler lacks proper support for handling unaligned symbols.
-
config RANDOMIZE_BASE
bool "Randomize the address of the kernel image (KASLR)"
default y
@@ -611,6 +602,25 @@ config RANDOMIZE_BASE
as a security feature that deters exploit attempts relying on
knowledge of the location of kernel internals.
+config KERNEL_IMAGE_BASE
+ hex "Kernel image base address"
+ range 0x100000 0x1FFFFFE0000000 if !KASAN
+ range 0x100000 0x1BFFFFE0000000 if KASAN
+ default 0x3FFE0000000 if !KASAN
+ default 0x7FFFE0000000 if KASAN
+ help
+ This is the address at which the kernel image is loaded in case
+ Kernel Address Space Layout Randomization (KASLR) is disabled.
+
+ In case the Protected virtualization guest support is enabled, the
+ Ultravisor imposes a virtual address limit. If the value of this
+ option leads to the kernel image exceeding the Ultravisor limit,
+ this option is ignored and the image is loaded below the limit.
+
+ If the value of this option leads to the kernel image overlapping
+ the virtual memory where other data structures are located, this
+ option is ignored and the image is loaded above the structures.
+
endmenu
menu "Memory setup"
@@ -724,6 +734,33 @@ config EADM_SCH
To compile this driver as a module, choose M here: the
module will be called eadm_sch.
+config AP
+ def_tristate y
+ prompt "Support for Adjunct Processors (ap)"
+ help
+ This driver allows usage of Adjunct Processor (AP) devices via
+ the ap bus, cards and queues. Supported Adjunct Processors are
+ the CryptoExpress Cards (CEX).
+
+ To compile this driver as a module, choose M here: the
+ module will be called ap.
+
+ If unsure, say Y (default).
+
+config AP_DEBUG
+ def_bool n
+ prompt "Enable debug features for Adjunct Processor (ap) devices"
+ depends on AP
+ help
+ Say 'Y' here to enable some additional debug features for Adjunct
+ Processor (ap) devices.
+
+ There will be some more sysfs attributes displayed for ap queues.
+
+ Do not enable this on production-level kernel builds.
+
+ If unsure, say N.
+
config VFIO_CCW
def_tristate n
prompt "Support for VFIO-CCW subchannels"
@@ -740,7 +777,7 @@ config VFIO_AP
prompt "VFIO support for AP devices"
depends on KVM
depends on VFIO
- depends on ZCRYPT
+ depends on AP
select VFIO_MDEV
help
This driver grants access to Adjunct Processor (AP) devices
diff --git a/arch/s390/Makefile b/arch/s390/Makefile
index 2dbb2d2f22f9..f2b21c7a70ef 100644
--- a/arch/s390/Makefile
+++ b/arch/s390/Makefile
@@ -14,14 +14,9 @@ KBUILD_AFLAGS_MODULE += -fPIC
KBUILD_CFLAGS_MODULE += -fPIC
KBUILD_AFLAGS += -m64
KBUILD_CFLAGS += -m64
-ifdef CONFIG_PIE_BUILD
-KBUILD_CFLAGS += -fPIE
-LDFLAGS_vmlinux := -pie -z notext
-else
-KBUILD_CFLAGS += $(call cc-option,-munaligned-symbols,)
-LDFLAGS_vmlinux := --emit-relocs --discard-none
+KBUILD_CFLAGS += -fPIC
+LDFLAGS_vmlinux := -no-pie --emit-relocs --discard-none
extra_tools := relocs
-endif
aflags_dwarf := -Wa,-gdwarf-2
KBUILD_AFLAGS_DECOMPRESSOR := $(CLANG_FLAGS) -m64 -D__ASSEMBLY__
ifndef CONFIG_AS_IS_LLVM
@@ -88,7 +83,6 @@ endif
ifdef CONFIG_EXPOLINE
ifdef CONFIG_EXPOLINE_EXTERN
- KBUILD_LDFLAGS_MODULE += arch/s390/lib/expoline/expoline.o
CC_FLAGS_EXPOLINE := -mindirect-branch=thunk-extern
CC_FLAGS_EXPOLINE += -mfunction-return=thunk-extern
else
@@ -167,11 +161,6 @@ vdso_prepare: prepare0
vdso-install-y += arch/s390/kernel/vdso64/vdso64.so.dbg
vdso-install-$(CONFIG_COMPAT) += arch/s390/kernel/vdso32/vdso32.so.dbg
-ifdef CONFIG_EXPOLINE_EXTERN
-modules_prepare: expoline_prepare
-expoline_prepare: scripts
- $(Q)$(MAKE) $(build)=arch/s390/lib/expoline arch/s390/lib/expoline/expoline.o
-endif
endif
# Don't use tabs in echo arguments
diff --git a/arch/s390/boot/Makefile b/arch/s390/boot/Makefile
index 294f08a8811a..070c9b2e905f 100644
--- a/arch/s390/boot/Makefile
+++ b/arch/s390/boot/Makefile
@@ -37,8 +37,7 @@ CFLAGS_sclp_early_core.o += -I$(srctree)/drivers/s390/char
obj-y := head.o als.o startup.o physmem_info.o ipl_parm.o ipl_report.o vmem.o
obj-y += string.o ebcdic.o sclp_early_core.o mem.o ipl_vmparm.o cmdline.o
-obj-y += version.o pgm_check_info.o ctype.o ipl_data.o
-obj-y += $(if $(CONFIG_PIE_BUILD),machine_kexec_reloc.o,relocs.o)
+obj-y += version.o pgm_check_info.o ctype.o ipl_data.o relocs.o
obj-$(findstring y, $(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) $(CONFIG_PGSTE)) += uv.o
obj-$(CONFIG_RANDOMIZE_BASE) += kaslr.o
obj-y += $(if $(CONFIG_KERNEL_UNCOMPRESSED),,decompressor.o) info.o
@@ -49,9 +48,7 @@ targets := bzImage section_cmp.boot.data section_cmp.boot.preserved.data $(obj-y
targets += vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2
targets += vmlinux.bin.xz vmlinux.bin.lzma vmlinux.bin.lzo vmlinux.bin.lz4
targets += vmlinux.bin.zst info.bin syms.bin vmlinux.syms $(obj-all)
-ifndef CONFIG_PIE_BUILD
targets += relocs.S
-endif
OBJECTS := $(addprefix $(obj)/,$(obj-y))
OBJECTS_ALL := $(addprefix $(obj)/,$(obj-all))
@@ -110,13 +107,11 @@ OBJCOPYFLAGS_vmlinux.bin := -O binary --remove-section=.comment --remove-section
$(obj)/vmlinux.bin: vmlinux FORCE
$(call if_changed,objcopy)
-ifndef CONFIG_PIE_BUILD
CMD_RELOCS=arch/s390/tools/relocs
-quiet_cmd_relocs = RELOCS $@
+quiet_cmd_relocs = RELOCS $@
cmd_relocs = $(CMD_RELOCS) $< > $@
$(obj)/relocs.S: vmlinux FORCE
$(call if_changed,relocs)
-endif
suffix-$(CONFIG_KERNEL_GZIP) := .gz
suffix-$(CONFIG_KERNEL_BZIP2) := .bz2
diff --git a/arch/s390/boot/boot.h b/arch/s390/boot/boot.h
index 567d60f78bbc..18027fdc92b0 100644
--- a/arch/s390/boot/boot.h
+++ b/arch/s390/boot/boot.h
@@ -17,7 +17,6 @@ struct machine_info {
};
struct vmlinux_info {
- unsigned long default_lma;
unsigned long entry;
unsigned long image_size; /* does not include .bss */
unsigned long bss_size; /* uncompressed image .bss size */
@@ -25,14 +24,8 @@ struct vmlinux_info {
unsigned long bootdata_size;
unsigned long bootdata_preserved_off;
unsigned long bootdata_preserved_size;
-#ifdef CONFIG_PIE_BUILD
- unsigned long dynsym_start;
- unsigned long rela_dyn_start;
- unsigned long rela_dyn_end;
-#else
unsigned long got_start;
unsigned long got_end;
-#endif
unsigned long amode31_size;
unsigned long init_mm_off;
unsigned long swapper_pg_dir_off;
@@ -74,10 +67,11 @@ void sclp_early_setup_buffer(void);
void print_pgm_check_info(void);
unsigned long randomize_within_range(unsigned long size, unsigned long align,
unsigned long min, unsigned long max);
-void setup_vmem(unsigned long asce_limit);
+void setup_vmem(unsigned long kernel_start, unsigned long kernel_end, unsigned long asce_limit);
void __printf(1, 2) decompressor_printk(const char *fmt, ...);
void print_stacktrace(unsigned long sp);
void error(char *m);
+int get_random(unsigned long limit, unsigned long *value);
extern struct machine_info machine;
@@ -98,6 +92,10 @@ extern struct vmlinux_info _vmlinux_info;
#define vmlinux _vmlinux_info
#define __abs_lowcore_pa(x) (((unsigned long)(x) - __abs_lowcore) % sizeof(struct lowcore))
+#define __kernel_va(x) ((void *)((unsigned long)(x) - __kaslr_offset_phys + __kaslr_offset))
+#define __kernel_pa(x) ((unsigned long)(x) - __kaslr_offset + __kaslr_offset_phys)
+#define __identity_va(x) ((void *)((unsigned long)(x) + __identity_base))
+#define __identity_pa(x) ((unsigned long)(x) - __identity_base)
static inline bool intersects(unsigned long addr0, unsigned long size0,
unsigned long addr1, unsigned long size1)
diff --git a/arch/s390/boot/decompressor.c b/arch/s390/boot/decompressor.c
index d762733a0753..f478e8e9cbda 100644
--- a/arch/s390/boot/decompressor.c
+++ b/arch/s390/boot/decompressor.c
@@ -63,24 +63,13 @@ static unsigned long free_mem_end_ptr = (unsigned long) _end + BOOT_HEAP_SIZE;
#include "../../../../lib/decompress_unzstd.c"
#endif
-#define decompress_offset ALIGN((unsigned long)_end + BOOT_HEAP_SIZE, PAGE_SIZE)
-
unsigned long mem_safe_offset(void)
{
- /*
- * due to 4MB HEAD_SIZE for bzip2
- * 'decompress_offset + vmlinux.image_size' could be larger than
- * kernel at final position + its .bss, so take the larger of two
- */
- return max(decompress_offset + vmlinux.image_size,
- vmlinux.default_lma + vmlinux.image_size + vmlinux.bss_size);
+ return ALIGN(free_mem_end_ptr, PAGE_SIZE);
}
-void *decompress_kernel(void)
+void deploy_kernel(void *output)
{
- void *output = (void *)decompress_offset;
-
__decompress(_compressed_start, _compressed_end - _compressed_start,
NULL, NULL, output, vmlinux.image_size, NULL, error);
- return output;
}
diff --git a/arch/s390/boot/decompressor.h b/arch/s390/boot/decompressor.h
index 92b81d2ea35d..4f966f06bd65 100644
--- a/arch/s390/boot/decompressor.h
+++ b/arch/s390/boot/decompressor.h
@@ -2,11 +2,9 @@
#ifndef BOOT_COMPRESSED_DECOMPRESSOR_H
#define BOOT_COMPRESSED_DECOMPRESSOR_H
-#ifdef CONFIG_KERNEL_UNCOMPRESSED
-static inline void *decompress_kernel(void) { return NULL; }
-#else
-void *decompress_kernel(void);
-#endif
+#ifndef CONFIG_KERNEL_UNCOMPRESSED
unsigned long mem_safe_offset(void);
+void deploy_kernel(void *output);
+#endif
#endif /* BOOT_COMPRESSED_DECOMPRESSOR_H */
diff --git a/arch/s390/boot/kaslr.c b/arch/s390/boot/kaslr.c
index 90602101e2ae..bd3bf5ef472d 100644
--- a/arch/s390/boot/kaslr.c
+++ b/arch/s390/boot/kaslr.c
@@ -43,7 +43,7 @@ static int check_prng(void)
return PRNG_MODE_TDES;
}
-static int get_random(unsigned long limit, unsigned long *value)
+int get_random(unsigned long limit, unsigned long *value)
{
struct prng_parm prng = {
/* initial parameter block for tdes mode, copied from libica */
diff --git a/arch/s390/boot/pgm_check_info.c b/arch/s390/boot/pgm_check_info.c
index 97244cd7a206..ea96275b0380 100644
--- a/arch/s390/boot/pgm_check_info.c
+++ b/arch/s390/boot/pgm_check_info.c
@@ -153,8 +153,10 @@ void print_pgm_check_info(void)
decompressor_printk("Kernel command line: %s\n", early_command_line);
decompressor_printk("Kernel fault: interruption code %04x ilc:%x\n",
S390_lowcore.pgm_code, S390_lowcore.pgm_ilc >> 1);
- if (kaslr_enabled())
+ if (kaslr_enabled()) {
decompressor_printk("Kernel random base: %lx\n", __kaslr_offset);
+ decompressor_printk("Kernel random base phys: %lx\n", __kaslr_offset_phys);
+ }
decompressor_printk("PSW : %016lx %016lx (%pS)\n",
S390_lowcore.psw_save_area.mask,
S390_lowcore.psw_save_area.addr,
diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c
index 6cf89314209a..467283b112cd 100644
--- a/arch/s390/boot/startup.c
+++ b/arch/s390/boot/startup.c
@@ -3,6 +3,7 @@
#include <linux/elf.h>
#include <asm/page-states.h>
#include <asm/boot_data.h>
+#include <asm/extmem.h>
#include <asm/sections.h>
#include <asm/maccess.h>
#include <asm/cpu_mf.h>
@@ -18,7 +19,7 @@
#include "boot.h"
#include "uv.h"
-unsigned long __bootdata_preserved(__kaslr_offset);
+struct vm_layout __bootdata_preserved(vm_layout);
unsigned long __bootdata_preserved(__abs_lowcore);
unsigned long __bootdata_preserved(__memcpy_real_area);
pte_t *__bootdata_preserved(memcpy_real_ptep);
@@ -29,7 +30,6 @@ unsigned long __bootdata_preserved(vmemmap_size);
unsigned long __bootdata_preserved(MODULES_VADDR);
unsigned long __bootdata_preserved(MODULES_END);
unsigned long __bootdata_preserved(max_mappable);
-unsigned long __bootdata(ident_map_size);
u64 __bootdata_preserved(stfle_fac_list[16]);
u64 __bootdata_preserved(alt_stfle_fac_list[16]);
@@ -109,9 +109,19 @@ static void setup_lpp(void)
}
#ifdef CONFIG_KERNEL_UNCOMPRESSED
-unsigned long mem_safe_offset(void)
+static unsigned long mem_safe_offset(void)
{
- return vmlinux.default_lma + vmlinux.image_size + vmlinux.bss_size;
+ return (unsigned long)_compressed_start;
+}
+
+static void deploy_kernel(void *output)
+{
+ void *uncompressed_start = (void *)_compressed_start;
+
+ if (output == uncompressed_start)
+ return;
+ memmove(output, uncompressed_start, vmlinux.image_size);
+ memset(uncompressed_start, 0, vmlinux.image_size);
}
#endif
@@ -141,70 +151,18 @@ static void copy_bootdata(void)
memcpy((void *)vmlinux.bootdata_preserved_off, __boot_data_preserved_start, vmlinux.bootdata_preserved_size);
}
-#ifdef CONFIG_PIE_BUILD
-static void kaslr_adjust_relocs(unsigned long min_addr, unsigned long max_addr, unsigned long offset)
-{
- Elf64_Rela *rela_start, *rela_end, *rela;
- int r_type, r_sym, rc;
- Elf64_Addr loc, val;
- Elf64_Sym *dynsym;
-
- rela_start = (Elf64_Rela *) vmlinux.rela_dyn_start;
- rela_end = (Elf64_Rela *) vmlinux.rela_dyn_end;
- dynsym = (Elf64_Sym *) vmlinux.dynsym_start;
- for (rela = rela_start; rela < rela_end; rela++) {
- loc = rela->r_offset + offset;
- val = rela->r_addend;
- r_sym = ELF64_R_SYM(rela->r_info);
- if (r_sym) {
- if (dynsym[r_sym].st_shndx != SHN_UNDEF)
- val += dynsym[r_sym].st_value + offset;
- } else {
- /*
- * 0 == undefined symbol table index (STN_UNDEF),
- * used for R_390_RELATIVE, only add KASLR offset
- */
- val += offset;
- }
- r_type = ELF64_R_TYPE(rela->r_info);
- rc = arch_kexec_do_relocs(r_type, (void *) loc, val, 0);
- if (rc)
- error("Unknown relocation type");
- }
-}
-
-static void kaslr_adjust_got(unsigned long offset) {}
-static void rescue_relocs(void) {}
-static void free_relocs(void) {}
-#else
-static int *vmlinux_relocs_64_start;
-static int *vmlinux_relocs_64_end;
-
-static void rescue_relocs(void)
-{
- unsigned long size = __vmlinux_relocs_64_end - __vmlinux_relocs_64_start;
-
- vmlinux_relocs_64_start = (void *)physmem_alloc_top_down(RR_RELOC, size, 0);
- vmlinux_relocs_64_end = (void *)vmlinux_relocs_64_start + size;
- memmove(vmlinux_relocs_64_start, __vmlinux_relocs_64_start, size);
-}
-
-static void free_relocs(void)
-{
- physmem_free(RR_RELOC);
-}
-
-static void kaslr_adjust_relocs(unsigned long min_addr, unsigned long max_addr, unsigned long offset)
+static void kaslr_adjust_relocs(unsigned long min_addr, unsigned long max_addr,
+ unsigned long offset, unsigned long phys_offset)
{
int *reloc;
long loc;
/* Adjust R_390_64 relocations */
- for (reloc = vmlinux_relocs_64_start; reloc < vmlinux_relocs_64_end; reloc++) {
- loc = (long)*reloc + offset;
+ for (reloc = (int *)__vmlinux_relocs_64_start; reloc < (int *)__vmlinux_relocs_64_end; reloc++) {
+ loc = (long)*reloc + phys_offset;
if (loc < min_addr || loc > max_addr)
error("64-bit relocation outside of kernel!\n");
- *(u64 *)loc += offset;
+ *(u64 *)loc += offset - __START_KERNEL;
}
}
@@ -217,9 +175,8 @@ static void kaslr_adjust_got(unsigned long offset)
* reason. Adjust the GOT entries.
*/
for (entry = (u64 *)vmlinux.got_start; entry < (u64 *)vmlinux.got_end; entry++)
- *entry += offset;
+ *entry += offset - __START_KERNEL;
}
-#endif
/*
* Merge information from several sources into a single ident_map_size value.
@@ -261,9 +218,26 @@ static void setup_ident_map_size(unsigned long max_physmem_end)
#endif
}
-static unsigned long setup_kernel_memory_layout(void)
+#define FIXMAP_SIZE round_up(MEMCPY_REAL_SIZE + ABS_LOWCORE_MAP_SIZE, sizeof(struct lowcore))
+
+static unsigned long get_vmem_size(unsigned long identity_size,
+ unsigned long vmemmap_size,
+ unsigned long vmalloc_size,
+ unsigned long rte_size)
+{
+ unsigned long max_mappable, vsize;
+
+ max_mappable = max(identity_size, MAX_DCSS_ADDR);
+ vsize = round_up(SZ_2G + max_mappable, rte_size) +
+ round_up(vmemmap_size, rte_size) +
+ FIXMAP_SIZE + MODULES_LEN + KASLR_LEN;
+ return size_add(vsize, vmalloc_size);
+}
+
+static unsigned long setup_kernel_memory_layout(unsigned long kernel_size)
{
unsigned long vmemmap_start;
+ unsigned long kernel_start;
unsigned long asce_limit;
unsigned long rte_size;
unsigned long pages;
@@ -275,12 +249,19 @@ static unsigned long setup_kernel_memory_layout(void)
vmemmap_size = SECTION_ALIGN_UP(pages) * sizeof(struct page);
/* choose kernel address space layout: 4 or 3 levels. */
- vsize = round_up(ident_map_size, _REGION3_SIZE) + vmemmap_size +
- MODULES_LEN + MEMCPY_REAL_SIZE + ABS_LOWCORE_MAP_SIZE;
- vsize = size_add(vsize, vmalloc_size);
- if (IS_ENABLED(CONFIG_KASAN) || (vsize > _REGION2_SIZE)) {
+ BUILD_BUG_ON(!IS_ALIGNED(__START_KERNEL, THREAD_SIZE));
+ BUILD_BUG_ON(!IS_ALIGNED(__NO_KASLR_START_KERNEL, THREAD_SIZE));
+ BUILD_BUG_ON(__NO_KASLR_END_KERNEL > _REGION1_SIZE);
+ vsize = get_vmem_size(ident_map_size, vmemmap_size, vmalloc_size, _REGION3_SIZE);
+ if (IS_ENABLED(CONFIG_KASAN) || __NO_KASLR_END_KERNEL > _REGION2_SIZE ||
+ (vsize > _REGION2_SIZE && kaslr_enabled())) {
asce_limit = _REGION1_SIZE;
- rte_size = _REGION2_SIZE;
+ if (__NO_KASLR_END_KERNEL > _REGION2_SIZE) {
+ rte_size = _REGION2_SIZE;
+ vsize = get_vmem_size(ident_map_size, vmemmap_size, vmalloc_size, _REGION2_SIZE);
+ } else {
+ rte_size = _REGION3_SIZE;
+ }
} else {
asce_limit = _REGION2_SIZE;
rte_size = _REGION3_SIZE;
@@ -290,38 +271,67 @@ static unsigned long setup_kernel_memory_layout(void)
* Forcing modules and vmalloc area under the ultravisor
* secure storage limit, so that any vmalloc allocation
* we do could be used to back secure guest storage.
+ *
+ * Assume the secure storage limit always exceeds _REGION2_SIZE,
+ * otherwise asce_limit and rte_size would have been adjusted.
*/
vmax = adjust_to_uv_max(asce_limit);
#ifdef CONFIG_KASAN
+ BUILD_BUG_ON(__NO_KASLR_END_KERNEL > KASAN_SHADOW_START);
/* force vmalloc and modules below kasan shadow */
vmax = min(vmax, KASAN_SHADOW_START);
#endif
- __memcpy_real_area = round_down(vmax - MEMCPY_REAL_SIZE, PAGE_SIZE);
- __abs_lowcore = round_down(__memcpy_real_area - ABS_LOWCORE_MAP_SIZE,
- sizeof(struct lowcore));
- MODULES_END = round_down(__abs_lowcore, _SEGMENT_SIZE);
+ vsize = min(vsize, vmax);
+ if (kaslr_enabled()) {
+ unsigned long kernel_end, kaslr_len, slots, pos;
+
+ kaslr_len = max(KASLR_LEN, vmax - vsize);
+ slots = DIV_ROUND_UP(kaslr_len - kernel_size, THREAD_SIZE);
+ if (get_random(slots, &pos))
+ pos = 0;
+ kernel_end = vmax - pos * THREAD_SIZE;
+ kernel_start = round_down(kernel_end - kernel_size, THREAD_SIZE);
+ } else if (vmax < __NO_KASLR_END_KERNEL || vsize > __NO_KASLR_END_KERNEL) {
+ kernel_start = round_down(vmax - kernel_size, THREAD_SIZE);
+ decompressor_printk("The kernel base address is forced to %lx\n", kernel_start);
+ } else {
+ kernel_start = __NO_KASLR_START_KERNEL;
+ }
+ __kaslr_offset = kernel_start;
+
+ MODULES_END = round_down(kernel_start, _SEGMENT_SIZE);
MODULES_VADDR = MODULES_END - MODULES_LEN;
VMALLOC_END = MODULES_VADDR;
/* allow vmalloc area to occupy up to about 1/2 of the rest virtual space left */
- vsize = round_down(VMALLOC_END / 2, _SEGMENT_SIZE);
+ vsize = (VMALLOC_END - FIXMAP_SIZE) / 2;
+ vsize = round_down(vsize, _SEGMENT_SIZE);
vmalloc_size = min(vmalloc_size, vsize);
VMALLOC_START = VMALLOC_END - vmalloc_size;
+ __memcpy_real_area = round_down(VMALLOC_START - MEMCPY_REAL_SIZE, PAGE_SIZE);
+ __abs_lowcore = round_down(__memcpy_real_area - ABS_LOWCORE_MAP_SIZE,
+ sizeof(struct lowcore));
+
/* split remaining virtual space between 1:1 mapping & vmemmap array */
- pages = VMALLOC_START / (PAGE_SIZE + sizeof(struct page));
+ pages = __abs_lowcore / (PAGE_SIZE + sizeof(struct page));
pages = SECTION_ALIGN_UP(pages);
/* keep vmemmap_start aligned to a top level region table entry */
- vmemmap_start = round_down(VMALLOC_START - pages * sizeof(struct page), rte_size);
- vmemmap_start = min(vmemmap_start, 1UL << MAX_PHYSMEM_BITS);
- /* maximum mappable address as seen by arch_get_mappable_range() */
- max_mappable = vmemmap_start;
+ vmemmap_start = round_down(__abs_lowcore - pages * sizeof(struct page), rte_size);
/* make sure identity map doesn't overlay with vmemmap */
ident_map_size = min(ident_map_size, vmemmap_start);
vmemmap_size = SECTION_ALIGN_UP(ident_map_size / PAGE_SIZE) * sizeof(struct page);
- /* make sure vmemmap doesn't overlay with vmalloc area */
- VMALLOC_START = max(vmemmap_start + vmemmap_size, VMALLOC_START);
+ /* make sure vmemmap doesn't overlay with absolute lowcore area */
+ if (vmemmap_start + vmemmap_size > __abs_lowcore) {
+ vmemmap_size = SECTION_ALIGN_DOWN(ident_map_size / PAGE_SIZE) * sizeof(struct page);
+ ident_map_size = vmemmap_size / sizeof(struct page) * PAGE_SIZE;
+ }
vmemmap = (struct page *)vmemmap_start;
+ /* maximum address for which linear mapping could be created (DCSS, memory) */
+ BUILD_BUG_ON(MAX_DCSS_ADDR > (1UL << MAX_PHYSMEM_BITS));
+ max_mappable = max(ident_map_size, MAX_DCSS_ADDR);
+ max_mappable = min(max_mappable, vmemmap_start);
+ __identity_base = round_down(vmemmap_start - max_mappable, rte_size);
return asce_limit;
}
@@ -329,9 +339,9 @@ static unsigned long setup_kernel_memory_layout(void)
/*
* This function clears the BSS section of the decompressed Linux kernel and NOT the decompressor's.
*/
-static void clear_bss_section(unsigned long vmlinux_lma)
+static void clear_bss_section(unsigned long kernel_start)
{
- memset((void *)vmlinux_lma + vmlinux.image_size, 0, vmlinux.bss_size);
+ memset((void *)kernel_start + vmlinux.image_size, 0, vmlinux.bss_size);
}
/*
@@ -348,19 +358,12 @@ static void setup_vmalloc_size(void)
vmalloc_size = max(size, vmalloc_size);
}
-static void kaslr_adjust_vmlinux_info(unsigned long offset)
+static void kaslr_adjust_vmlinux_info(long offset)
{
- *(unsigned long *)(&vmlinux.entry) += offset;
vmlinux.bootdata_off += offset;
vmlinux.bootdata_preserved_off += offset;
-#ifdef CONFIG_PIE_BUILD
- vmlinux.rela_dyn_start += offset;
- vmlinux.rela_dyn_end += offset;
- vmlinux.dynsym_start += offset;
-#else
vmlinux.got_start += offset;
vmlinux.got_end += offset;
-#endif
vmlinux.init_mm_off += offset;
vmlinux.swapper_pg_dir_off += offset;
vmlinux.invalid_pg_dir_off += offset;
@@ -373,23 +376,30 @@ static void kaslr_adjust_vmlinux_info(unsigned long offset)
#endif
}
+static void fixup_vmlinux_info(void)
+{
+ vmlinux.entry -= __START_KERNEL;
+ kaslr_adjust_vmlinux_info(-__START_KERNEL);
+}
+
void startup_kernel(void)
{
- unsigned long max_physmem_end;
- unsigned long vmlinux_lma = 0;
+ unsigned long kernel_size = vmlinux.image_size + vmlinux.bss_size;
+ unsigned long nokaslr_offset_phys = mem_safe_offset();
unsigned long amode31_lma = 0;
+ unsigned long max_physmem_end;
unsigned long asce_limit;
unsigned long safe_addr;
- void *img;
psw_t psw;
+ fixup_vmlinux_info();
setup_lpp();
- safe_addr = mem_safe_offset();
+ safe_addr = PAGE_ALIGN(nokaslr_offset_phys + kernel_size);
/*
- * Reserve decompressor memory together with decompression heap, buffer and
- * memory which might be occupied by uncompressed kernel at default 1Mb
- * position (if KASLR is off or failed).
+ * Reserve decompressor memory together with decompression heap,
+ * buffer and memory which might be occupied by uncompressed kernel
+ * (if KASLR is off or failed).
*/
physmem_reserve(RR_DECOMPRESSOR, 0, safe_addr);
if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && parmarea.initrd_size)
@@ -409,40 +419,39 @@ void startup_kernel(void)
max_physmem_end = detect_max_physmem_end();
setup_ident_map_size(max_physmem_end);
setup_vmalloc_size();
- asce_limit = setup_kernel_memory_layout();
+ asce_limit = setup_kernel_memory_layout(kernel_size);
/* got final ident_map_size, physmem allocations could be performed now */
physmem_set_usable_limit(ident_map_size);
detect_physmem_online_ranges(max_physmem_end);
save_ipl_cert_comp_list();
rescue_initrd(safe_addr, ident_map_size);
- rescue_relocs();
- if (kaslr_enabled()) {
- vmlinux_lma = randomize_within_range(vmlinux.image_size + vmlinux.bss_size,
- THREAD_SIZE, vmlinux.default_lma,
- ident_map_size);
- if (vmlinux_lma) {
- __kaslr_offset = vmlinux_lma - vmlinux.default_lma;
- kaslr_adjust_vmlinux_info(__kaslr_offset);
- }
- }
- vmlinux_lma = vmlinux_lma ?: vmlinux.default_lma;
- physmem_reserve(RR_VMLINUX, vmlinux_lma, vmlinux.image_size + vmlinux.bss_size);
-
- if (!IS_ENABLED(CONFIG_KERNEL_UNCOMPRESSED)) {
- img = decompress_kernel();
- memmove((void *)vmlinux_lma, img, vmlinux.image_size);
- } else if (__kaslr_offset) {
- img = (void *)vmlinux.default_lma;
- memmove((void *)vmlinux_lma, img, vmlinux.image_size);
- memset(img, 0, vmlinux.image_size);
- }
+ if (kaslr_enabled())
+ __kaslr_offset_phys = randomize_within_range(kernel_size, THREAD_SIZE, 0, ident_map_size);
+ if (!__kaslr_offset_phys)
+ __kaslr_offset_phys = nokaslr_offset_phys;
+ kaslr_adjust_vmlinux_info(__kaslr_offset_phys);
+ physmem_reserve(RR_VMLINUX, __kaslr_offset_phys, kernel_size);
+ deploy_kernel((void *)__kaslr_offset_phys);
/* vmlinux decompression is done, shrink reserved low memory */
physmem_reserve(RR_DECOMPRESSOR, 0, (unsigned long)_decompressor_end);
+
+ /*
+ * In case KASLR is enabled the randomized location of .amode31
+ * section might overlap with .vmlinux.relocs section. To avoid that
+ * the below randomize_within_range() could have been called with
+ * __vmlinux_relocs_64_end as the lower range address. However,
+ * .amode31 section is written to by the decompressed kernel - at
+ * that time the contents of .vmlinux.relocs is not needed anymore.
+ * Conversely, .vmlinux.relocs is read only by the decompressor, even
+ * before the kernel started. Therefore, in case the two sections
+ * overlap there is no risk of corrupting any data.
+ */
if (kaslr_enabled())
amode31_lma = randomize_within_range(vmlinux.amode31_size, PAGE_SIZE, 0, SZ_2G);
- amode31_lma = amode31_lma ?: vmlinux.default_lma - vmlinux.amode31_size;
+ if (!amode31_lma)
+ amode31_lma = __kaslr_offset_phys - vmlinux.amode31_size;
physmem_reserve(RR_AMODE31, amode31_lma, vmlinux.amode31_size);
/*
@@ -458,23 +467,23 @@ void startup_kernel(void)
* - copy_bootdata() must follow setup_vmem() to propagate changes
* to bootdata made by setup_vmem()
*/
- clear_bss_section(vmlinux_lma);
- kaslr_adjust_relocs(vmlinux_lma, vmlinux_lma + vmlinux.image_size, __kaslr_offset);
+ clear_bss_section(__kaslr_offset_phys);
+ kaslr_adjust_relocs(__kaslr_offset_phys, __kaslr_offset_phys + vmlinux.image_size,
+ __kaslr_offset, __kaslr_offset_phys);
kaslr_adjust_got(__kaslr_offset);
- free_relocs();
- setup_vmem(asce_limit);
+ setup_vmem(__kaslr_offset, __kaslr_offset + kernel_size, asce_limit);
copy_bootdata();
/*
* Save KASLR offset for early dumps, before vmcore_info is set.
* Mark as uneven to distinguish from real vmcore_info pointer.
*/
- S390_lowcore.vmcore_info = __kaslr_offset ? __kaslr_offset | 0x1UL : 0;
+ S390_lowcore.vmcore_info = __kaslr_offset_phys ? __kaslr_offset_phys | 0x1UL : 0;
/*
* Jump to the decompressed kernel entry point and switch DAT mode on.
*/
- psw.addr = vmlinux.entry;
+ psw.addr = __kaslr_offset + vmlinux.entry;
psw.mask = PSW_KERNEL_BITS;
__load_psw(psw);
}
diff --git a/arch/s390/boot/vmem.c b/arch/s390/boot/vmem.c
index 09b10bb6e4d0..96d48b7112d4 100644
--- a/arch/s390/boot/vmem.c
+++ b/arch/s390/boot/vmem.c
@@ -27,6 +27,8 @@ enum populate_mode {
POPULATE_NONE,
POPULATE_DIRECT,
POPULATE_ABS_LOWCORE,
+ POPULATE_IDENTITY,
+ POPULATE_KERNEL,
#ifdef CONFIG_KASAN
POPULATE_KASAN_MAP_SHADOW,
POPULATE_KASAN_ZERO_SHADOW,
@@ -54,7 +56,7 @@ static inline void kasan_populate(unsigned long start, unsigned long end, enum p
pgtable_populate(start, end, mode);
}
-static void kasan_populate_shadow(void)
+static void kasan_populate_shadow(unsigned long kernel_start, unsigned long kernel_end)
{
pmd_t pmd_z = __pmd(__pa(kasan_early_shadow_pte) | _SEGMENT_ENTRY);
pud_t pud_z = __pud(__pa(kasan_early_shadow_pmd) | _REGION3_ENTRY);
@@ -76,44 +78,20 @@ static void kasan_populate_shadow(void)
__arch_set_page_dat(kasan_early_shadow_pmd, 1UL << CRST_ALLOC_ORDER);
__arch_set_page_dat(kasan_early_shadow_pte, 1);
- /*
- * Current memory layout:
- * +- 0 -------------+ +- shadow start -+
- * |1:1 ident mapping| /|1/8 of ident map|
- * | | / | |
- * +-end of ident map+ / +----------------+
- * | ... gap ... | / | kasan |
- * | | / | zero page |
- * +- vmalloc area -+ / | mapping |
- * | vmalloc_size | / | (untracked) |
- * +- modules vaddr -+ / +----------------+
- * | 2Gb |/ | unmapped | allocated per module
- * +- shadow start -+ +----------------+
- * | 1/8 addr space | | zero pg mapping| (untracked)
- * +- shadow end ----+---------+- shadow end ---+
- *
- * Current memory layout (KASAN_VMALLOC):
- * +- 0 -------------+ +- shadow start -+
- * |1:1 ident mapping| /|1/8 of ident map|
- * | | / | |
- * +-end of ident map+ / +----------------+
- * | ... gap ... | / | kasan zero page| (untracked)
- * | | / | mapping |
- * +- vmalloc area -+ / +----------------+
- * | vmalloc_size | / |shallow populate|
- * +- modules vaddr -+ / +----------------+
- * | 2Gb |/ |shallow populate|
- * +- shadow start -+ +----------------+
- * | 1/8 addr space | | zero pg mapping| (untracked)
- * +- shadow end ----+---------+- shadow end ---+
- */
-
for_each_physmem_usable_range(i, &start, &end) {
- kasan_populate(start, end, POPULATE_KASAN_MAP_SHADOW);
- if (memgap_start && physmem_info.info_source == MEM_DETECT_DIAG260)
- kasan_populate(memgap_start, start, POPULATE_KASAN_ZERO_SHADOW);
+ kasan_populate((unsigned long)__identity_va(start),
+ (unsigned long)__identity_va(end),
+ POPULATE_KASAN_MAP_SHADOW);
+ if (memgap_start && physmem_info.info_source == MEM_DETECT_DIAG260) {
+ kasan_populate((unsigned long)__identity_va(memgap_start),
+ (unsigned long)__identity_va(start),
+ POPULATE_KASAN_ZERO_SHADOW);
+ }
memgap_start = end;
}
+ kasan_populate(kernel_start, kernel_end, POPULATE_KASAN_MAP_SHADOW);
+ kasan_populate(0, (unsigned long)__identity_va(0), POPULATE_KASAN_ZERO_SHADOW);
+ kasan_populate(AMODE31_START, AMODE31_END, POPULATE_KASAN_ZERO_SHADOW);
if (IS_ENABLED(CONFIG_KASAN_VMALLOC)) {
untracked_end = VMALLOC_START;
/* shallowly populate kasan shadow for vmalloc and modules */
@@ -122,8 +100,9 @@ static void kasan_populate_shadow(void)
untracked_end = MODULES_VADDR;
}
/* populate kasan shadow for untracked memory */
- kasan_populate(ident_map_size, untracked_end, POPULATE_KASAN_ZERO_SHADOW);
- kasan_populate(MODULES_END, _REGION1_SIZE, POPULATE_KASAN_ZERO_SHADOW);
+ kasan_populate((unsigned long)__identity_va(ident_map_size), untracked_end,
+ POPULATE_KASAN_ZERO_SHADOW);
+ kasan_populate(kernel_end, _REGION1_SIZE, POPULATE_KASAN_ZERO_SHADOW);
}
static bool kasan_pgd_populate_zero_shadow(pgd_t *pgd, unsigned long addr,
@@ -180,7 +159,9 @@ static bool kasan_pte_populate_zero_shadow(pte_t *pte, enum populate_mode mode)
}
#else
-static inline void kasan_populate_shadow(void) {}
+static inline void kasan_populate_shadow(unsigned long kernel_start, unsigned long kernel_end)
+{
+}
static inline bool kasan_pgd_populate_zero_shadow(pgd_t *pgd, unsigned long addr,
unsigned long end, enum populate_mode mode)
@@ -263,6 +244,10 @@ static unsigned long _pa(unsigned long addr, unsigned long size, enum populate_m
return addr;
case POPULATE_ABS_LOWCORE:
return __abs_lowcore_pa(addr);
+ case POPULATE_KERNEL:
+ return __kernel_pa(addr);
+ case POPULATE_IDENTITY:
+ return __identity_pa(addr);
#ifdef CONFIG_KASAN
case POPULATE_KASAN_MAP_SHADOW:
addr = physmem_alloc_top_down(RR_VMEM, size, size);
@@ -274,15 +259,22 @@ static unsigned long _pa(unsigned long addr, unsigned long size, enum populate_m
}
}
-static bool can_large_pud(pud_t *pu_dir, unsigned long addr, unsigned long end)
+static bool large_allowed(enum populate_mode mode)
+{
+ return (mode == POPULATE_DIRECT) || (mode == POPULATE_IDENTITY);
+}
+
+static bool can_large_pud(pud_t *pu_dir, unsigned long addr, unsigned long end,
+ enum populate_mode mode)
{
- return machine.has_edat2 &&
+ return machine.has_edat2 && large_allowed(mode) &&
IS_ALIGNED(addr, PUD_SIZE) && (end - addr) >= PUD_SIZE;
}
-static bool can_large_pmd(pmd_t *pm_dir, unsigned long addr, unsigned long end)
+static bool can_large_pmd(pmd_t *pm_dir, unsigned long addr, unsigned long end,
+ enum populate_mode mode)
{
- return machine.has_edat1 &&
+ return machine.has_edat1 && large_allowed(mode) &&
IS_ALIGNED(addr, PMD_SIZE) && (end - addr) >= PMD_SIZE;
}
@@ -322,7 +314,7 @@ static void pgtable_pmd_populate(pud_t *pud, unsigned long addr, unsigned long e
if (pmd_none(*pmd)) {
if (kasan_pmd_populate_zero_shadow(pmd, addr, next, mode))
continue;
- if (can_large_pmd(pmd, addr, next)) {
+ if (can_large_pmd(pmd, addr, next, mode)) {
entry = __pmd(_pa(addr, _SEGMENT_SIZE, mode));
entry = set_pmd_bit(entry, SEGMENT_KERNEL);
if (!machine.has_nx)
@@ -355,7 +347,7 @@ static void pgtable_pud_populate(p4d_t *p4d, unsigned long addr, unsigned long e
if (pud_none(*pud)) {
if (kasan_pud_populate_zero_shadow(pud, addr, next, mode))
continue;
- if (can_large_pud(pud, addr, next)) {
+ if (can_large_pud(pud, addr, next, mode)) {
entry = __pud(_pa(addr, _REGION3_SIZE, mode));
entry = set_pud_bit(entry, REGION3_KERNEL);
if (!machine.has_nx)
@@ -418,11 +410,12 @@ static void pgtable_populate(unsigned long addr, unsigned long end, enum populat
}
}
-void setup_vmem(unsigned long asce_limit)
+void setup_vmem(unsigned long kernel_start, unsigned long kernel_end, unsigned long asce_limit)
{
unsigned long start, end;
unsigned long asce_type;
unsigned long asce_bits;
+ pgd_t *init_mm_pgd;
int i;
/*
@@ -433,6 +426,15 @@ void setup_vmem(unsigned long asce_limit)
for_each_physmem_online_range(i, &start, &end)
__arch_set_page_nodat((void *)start, (end - start) >> PAGE_SHIFT);
+ /*
+ * init_mm.pgd contains the virtual address of swapper_pg_dir.
+ * It is unusable at this stage since DAT is still off. Swap
+ * it for physical address of swapper_pg_dir and restore
+ * the virtual address after all page tables are created.
+ */
+ init_mm_pgd = init_mm.pgd;
+ init_mm.pgd = (pgd_t *)swapper_pg_dir;
+
if (asce_limit == _REGION1_SIZE) {
asce_type = _REGION2_ENTRY_EMPTY;
asce_bits = _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH;
@@ -453,15 +455,20 @@ void setup_vmem(unsigned long asce_limit)
* the lowcore and create the identity mapping only afterwards.
*/
pgtable_populate(0, sizeof(struct lowcore), POPULATE_DIRECT);
- for_each_physmem_usable_range(i, &start, &end)
- pgtable_populate(start, end, POPULATE_DIRECT);
+ for_each_physmem_usable_range(i, &start, &end) {
+ pgtable_populate((unsigned long)__identity_va(start),
+ (unsigned long)__identity_va(end),
+ POPULATE_IDENTITY);
+ }
+ pgtable_populate(kernel_start, kernel_end, POPULATE_KERNEL);
+ pgtable_populate(AMODE31_START, AMODE31_END, POPULATE_DIRECT);
pgtable_populate(__abs_lowcore, __abs_lowcore + sizeof(struct lowcore),
POPULATE_ABS_LOWCORE);
pgtable_populate(__memcpy_real_area, __memcpy_real_area + PAGE_SIZE,
POPULATE_NONE);
- memcpy_real_ptep = __virt_to_kpte(__memcpy_real_area);
+ memcpy_real_ptep = __identity_va(__virt_to_kpte(__memcpy_real_area));
- kasan_populate_shadow();
+ kasan_populate_shadow(kernel_start, kernel_end);
S390_lowcore.kernel_asce.val = swapper_pg_dir | asce_bits;
S390_lowcore.user_asce = s390_invalid_asce;
@@ -471,4 +478,5 @@ void setup_vmem(unsigned long asce_limit)
local_ctl_load(13, &S390_lowcore.kernel_asce);
init_mm.context.asce = S390_lowcore.kernel_asce.val;
+ init_mm.pgd = init_mm_pgd;
}
diff --git a/arch/s390/boot/vmlinux.lds.S b/arch/s390/boot/vmlinux.lds.S
index 3d7ea585ab99..1fe5a1d3ff60 100644
--- a/arch/s390/boot/vmlinux.lds.S
+++ b/arch/s390/boot/vmlinux.lds.S
@@ -99,8 +99,16 @@ SECTIONS
_decompressor_end = .;
+ . = ALIGN(4);
+ .vmlinux.relocs : {
+ __vmlinux_relocs_64_start = .;
+ *(.vmlinux.relocs_64)
+ __vmlinux_relocs_64_end = .;
+ }
+
#ifdef CONFIG_KERNEL_UNCOMPRESSED
- . = 0x100000;
+ . = ALIGN(PAGE_SIZE);
+ . += AMODE31_SIZE; /* .amode31 section */
#else
. = ALIGN(8);
#endif
@@ -110,24 +118,6 @@ SECTIONS
_compressed_end = .;
}
-#ifndef CONFIG_PIE_BUILD
- /*
- * When the kernel is built with CONFIG_KERNEL_UNCOMPRESSED, the entire
- * uncompressed vmlinux.bin is positioned in the bzImage decompressor
- * image at the default kernel LMA of 0x100000, enabling it to be
- * executed in-place. However, the size of .vmlinux.relocs could be
- * large enough to cause an overlap with the uncompressed kernel at the
- * address 0x100000. To address this issue, .vmlinux.relocs is
- * positioned after the .rodata.compressed.
- */
- . = ALIGN(4);
- .vmlinux.relocs : {
- __vmlinux_relocs_64_start = .;
- *(.vmlinux.relocs_64)
- __vmlinux_relocs_64_end = .;
- }
-#endif
-
#define SB_TRAILER_SIZE 32
/* Trailer needed for Secure Boot */
. += SB_TRAILER_SIZE; /* make sure .sb.trailer does not overwrite the previous section */
diff --git a/arch/s390/include/asm/ap.h b/arch/s390/include/asm/ap.h
index 43ac4a64f49b..395b02d6a133 100644
--- a/arch/s390/include/asm/ap.h
+++ b/arch/s390/include/asm/ap.h
@@ -223,13 +223,18 @@ static inline struct ap_queue_status ap_zapq(ap_qid_t qid, int fbit)
* config info as returned by the ap_qci() function.
*/
struct ap_config_info {
- unsigned int apsc : 1; /* S bit */
- unsigned int apxa : 1; /* N bit */
- unsigned int qact : 1; /* C bit */
- unsigned int rc8a : 1; /* R bit */
- unsigned int : 4;
- unsigned int apsb : 1; /* B bit */
- unsigned int : 23;
+ union {
+ unsigned int flags;
+ struct {
+ unsigned int apsc : 1; /* S bit */
+ unsigned int apxa : 1; /* N bit */
+ unsigned int qact : 1; /* C bit */
+ unsigned int rc8a : 1; /* R bit */
+ unsigned int : 4;
+ unsigned int apsb : 1; /* B bit */
+ unsigned int : 23;
+ };
+ };
unsigned char na; /* max # of APs - 1 */
unsigned char nd; /* max # of Domains - 1 */
unsigned char _reserved0[10];
@@ -544,15 +549,4 @@ static inline struct ap_queue_status ap_dqap(ap_qid_t qid,
return reg1.status;
}
-/*
- * Interface to tell the AP bus code that a configuration
- * change has happened. The bus code should at least do
- * an ap bus resource rescan.
- */
-#if IS_ENABLED(CONFIG_ZCRYPT)
-void ap_bus_cfg_chg(void);
-#else
-static inline void ap_bus_cfg_chg(void){}
-#endif
-
#endif /* _ASM_S390_AP_H_ */
diff --git a/arch/s390/include/asm/asm-prototypes.h b/arch/s390/include/asm/asm-prototypes.h
index 56096ae26f29..f662eb4b9246 100644
--- a/arch/s390/include/asm/asm-prototypes.h
+++ b/arch/s390/include/asm/asm-prototypes.h
@@ -4,6 +4,7 @@
#include <linux/kvm_host.h>
#include <linux/ftrace.h>
#include <asm/fpu.h>
+#include <asm/nospec-branch.h>
#include <asm-generic/asm-prototypes.h>
__int128_t __ashlti3(__int128_t a, int b);
diff --git a/arch/s390/include/asm/chsc.h b/arch/s390/include/asm/chsc.h
index bb48ea380c0d..bb78159d8042 100644
--- a/arch/s390/include/asm/chsc.h
+++ b/arch/s390/include/asm/chsc.h
@@ -11,6 +11,9 @@
#include <uapi/asm/chsc.h>
+/* struct from linux/notifier.h */
+struct notifier_block;
+
/**
* Operation codes for CHSC PNSO:
* PNSO_OC_NET_BRIDGE_INFO - only addresses that are visible to a bridgeport
@@ -66,4 +69,16 @@ struct chsc_pnso_area {
struct chsc_pnso_naid_l2 entries[];
} __packed __aligned(PAGE_SIZE);
+/*
+ * notifier interface - registered notifiers get called on
+ * the following events:
+ * - ap config changed (CHSC_NOTIFY_AP_CFG)
+ */
+enum chsc_notify_type {
+ CHSC_NOTIFY_AP_CFG = 3,
+};
+
+int chsc_notifier_register(struct notifier_block *nb);
+int chsc_notifier_unregister(struct notifier_block *nb);
+
#endif /* _ASM_S390_CHSC_H */
diff --git a/arch/s390/include/asm/extmem.h b/arch/s390/include/asm/extmem.h
index 568fd81bb77b..e0a06060afdd 100644
--- a/arch/s390/include/asm/extmem.h
+++ b/arch/s390/include/asm/extmem.h
@@ -8,6 +8,13 @@
#define _ASM_S390X_DCSS_H
#ifndef __ASSEMBLY__
+/*
+ * DCSS segment is defined as a contiguous range of pages using DEFSEG command.
+ * The range start and end are page numbers with values less than or equal to
+ * 0x7ffffff (see CP Commands and Utilities Reference).
+ */
+#define MAX_DCSS_ADDR (512UL * SZ_1G)
+
/* possible values for segment type as returned by segment_info */
#define SEG_TYPE_SW 0
#define SEG_TYPE_EW 1
diff --git a/arch/s390/include/asm/ftrace.h b/arch/s390/include/asm/ftrace.h
index 621f23d5ae30..77e479d44f1e 100644
--- a/arch/s390/include/asm/ftrace.h
+++ b/arch/s390/include/asm/ftrace.h
@@ -8,12 +8,8 @@
#ifndef __ASSEMBLY__
-#ifdef CONFIG_CC_IS_CLANG
-/* https://llvm.org/pr41424 */
-#define ftrace_return_address(n) 0UL
-#else
-#define ftrace_return_address(n) __builtin_return_address(n)
-#endif
+unsigned long return_address(unsigned int n);
+#define ftrace_return_address(n) return_address(n)
void ftrace_caller(void);
diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h
index 5cc46e0dde62..9725586f4259 100644
--- a/arch/s390/include/asm/gmap.h
+++ b/arch/s390/include/asm/gmap.h
@@ -146,7 +146,7 @@ int gmap_mprotect_notify(struct gmap *, unsigned long start,
void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long dirty_bitmap[4],
unsigned long gaddr, unsigned long vmaddr);
-int gmap_mark_unmergeable(void);
+int s390_disable_cow_sharing(void);
void s390_unlist_old_asce(struct gmap *gmap);
int s390_replace_asce(struct gmap *gmap);
void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns);
diff --git a/arch/s390/include/asm/mmu.h b/arch/s390/include/asm/mmu.h
index bb1b4bef1878..4c2dc7abc285 100644
--- a/arch/s390/include/asm/mmu.h
+++ b/arch/s390/include/asm/mmu.h
@@ -32,6 +32,11 @@ typedef struct {
unsigned int uses_skeys:1;
/* The mmu context uses CMM. */
unsigned int uses_cmm:1;
+ /*
+ * The mmu context allows COW-sharing of memory pages (KSM, zeropage).
+ * Note that COW-sharing during fork() is currently always allowed.
+ */
+ unsigned int allow_cow_sharing:1;
/* The gmaps associated with this context are allowed to use huge pages. */
unsigned int allow_gmap_hpage_1m:1;
} mm_context_t;
diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h
index 929af18b0908..a7789a9f6218 100644
--- a/arch/s390/include/asm/mmu_context.h
+++ b/arch/s390/include/asm/mmu_context.h
@@ -35,6 +35,7 @@ static inline int init_new_context(struct task_struct *tsk,
mm->context.has_pgste = 0;
mm->context.uses_skeys = 0;
mm->context.uses_cmm = 0;
+ mm->context.allow_cow_sharing = 1;
mm->context.allow_gmap_hpage_1m = 0;
#endif
switch (mm->context.asce_limit) {
diff --git a/arch/s390/include/asm/nospec-branch.h b/arch/s390/include/asm/nospec-branch.h
index 82725cf783c7..b9c1f3cae842 100644
--- a/arch/s390/include/asm/nospec-branch.h
+++ b/arch/s390/include/asm/nospec-branch.h
@@ -17,6 +17,26 @@ static inline bool nospec_uses_trampoline(void)
return __is_defined(CC_USING_EXPOLINE) && !nospec_disable;
}
+#ifdef CONFIG_EXPOLINE_EXTERN
+
+void __s390_indirect_jump_r1(void);
+void __s390_indirect_jump_r2(void);
+void __s390_indirect_jump_r3(void);
+void __s390_indirect_jump_r4(void);
+void __s390_indirect_jump_r5(void);
+void __s390_indirect_jump_r6(void);
+void __s390_indirect_jump_r7(void);
+void __s390_indirect_jump_r8(void);
+void __s390_indirect_jump_r9(void);
+void __s390_indirect_jump_r10(void);
+void __s390_indirect_jump_r11(void);
+void __s390_indirect_jump_r12(void);
+void __s390_indirect_jump_r13(void);
+void __s390_indirect_jump_r14(void);
+void __s390_indirect_jump_r15(void);
+
+#endif
+
#endif /* __ASSEMBLY__ */
#endif /* _ASM_S390_EXPOLINE_H */
diff --git a/arch/s390/include/asm/nospec-insn.h b/arch/s390/include/asm/nospec-insn.h
index 7a946c42ad13..cb15dd25bf21 100644
--- a/arch/s390/include/asm/nospec-insn.h
+++ b/arch/s390/include/asm/nospec-insn.h
@@ -16,24 +16,25 @@
*/
.macro __THUNK_PROLOG_NAME name
#ifdef CONFIG_EXPOLINE_EXTERN
- .pushsection .text,"ax",@progbits
- __ALIGN
+ SYM_CODE_START(\name)
#else
.pushsection .text.\name,"axG",@progbits,\name,comdat
-#endif
.globl \name
.hidden \name
.type \name,@function
\name:
CFI_STARTPROC
+#endif
.endm
.macro __THUNK_EPILOG_NAME name
- CFI_ENDPROC
#ifdef CONFIG_EXPOLINE_EXTERN
- .size \name, .-\name
-#endif
+ SYM_CODE_END(\name)
+ EXPORT_SYMBOL(\name)
+#else
+ CFI_ENDPROC
.popsection
+#endif
.endm
.macro __THUNK_PROLOG_BR r1
diff --git a/arch/s390/include/asm/os_info.h b/arch/s390/include/asm/os_info.h
index a4d2e103f116..3ee9e8f5ceae 100644
--- a/arch/s390/include/asm/os_info.h
+++ b/arch/s390/include/asm/os_info.h
@@ -17,11 +17,25 @@
#define OS_INFO_VMCOREINFO 0
#define OS_INFO_REIPL_BLOCK 1
#define OS_INFO_FLAGS_ENTRY 2
+#define OS_INFO_RESERVED 3
+#define OS_INFO_IDENTITY_BASE 4
+#define OS_INFO_KASLR_OFFSET 5
+#define OS_INFO_KASLR_OFF_PHYS 6
+#define OS_INFO_VMEMMAP 7
+#define OS_INFO_AMODE31_START 8
+#define OS_INFO_AMODE31_END 9
+#define OS_INFO_IMAGE_START 10
+#define OS_INFO_IMAGE_END 11
+#define OS_INFO_IMAGE_PHYS 12
+#define OS_INFO_MAX 13
#define OS_INFO_FLAG_REIPL_CLEAR (1UL << 0)
struct os_info_entry {
- u64 addr;
+ union {
+ u64 addr;
+ u64 val;
+ };
u64 size;
u32 csum;
} __packed;
@@ -33,17 +47,24 @@ struct os_info {
u16 version_minor;
u64 crashkernel_addr;
u64 crashkernel_size;
- struct os_info_entry entry[3];
- u8 reserved[4004];
+ struct os_info_entry entry[OS_INFO_MAX];
+ u8 reserved[3804];
} __packed;
void os_info_init(void);
-void os_info_entry_add(int nr, void *ptr, u64 len);
+void os_info_entry_add_data(int nr, void *ptr, u64 len);
+void os_info_entry_add_val(int nr, u64 val);
void os_info_crashkernel_add(unsigned long base, unsigned long size);
u32 os_info_csum(struct os_info *os_info);
#ifdef CONFIG_CRASH_DUMP
void *os_info_old_entry(int nr, unsigned long *size);
+static inline unsigned long os_info_old_value(int nr)
+{
+ unsigned long size;
+
+ return (unsigned long)os_info_old_entry(nr, &size);
+}
#else
static inline void *os_info_old_entry(int nr, unsigned long *size)
{
diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h
index 9381879f7ecf..224ff9d433ea 100644
--- a/arch/s390/include/asm/page.h
+++ b/arch/s390/include/asm/page.h
@@ -178,19 +178,52 @@ int arch_make_page_accessible(struct page *page);
#define HAVE_ARCH_MAKE_PAGE_ACCESSIBLE
#endif
-#define __PAGE_OFFSET 0x0UL
-#define PAGE_OFFSET 0x0UL
+struct vm_layout {
+ unsigned long kaslr_offset;
+ unsigned long kaslr_offset_phys;
+ unsigned long identity_base;
+ unsigned long identity_size;
+};
-#define __pa_nodebug(x) ((unsigned long)(x))
+extern struct vm_layout vm_layout;
+
+#define __kaslr_offset vm_layout.kaslr_offset
+#define __kaslr_offset_phys vm_layout.kaslr_offset_phys
+#define __identity_base vm_layout.identity_base
+#define ident_map_size vm_layout.identity_size
+
+static inline unsigned long kaslr_offset(void)
+{
+ return __kaslr_offset;
+}
+
+extern int __kaslr_enabled;
+static inline int kaslr_enabled(void)
+{
+ if (IS_ENABLED(CONFIG_RANDOMIZE_BASE))
+ return __kaslr_enabled;
+ return 0;
+}
+
+#define __PAGE_OFFSET __identity_base
+#define PAGE_OFFSET __PAGE_OFFSET
#ifdef __DECOMPRESSOR
+#define __pa_nodebug(x) ((unsigned long)(x))
#define __pa(x) __pa_nodebug(x)
#define __pa32(x) __pa(x)
#define __va(x) ((void *)(unsigned long)(x))
#else /* __DECOMPRESSOR */
+static inline unsigned long __pa_nodebug(unsigned long x)
+{
+ if (x < __kaslr_offset)
+ return x - __identity_base;
+ return x - __kaslr_offset + __kaslr_offset_phys;
+}
+
#ifdef CONFIG_DEBUG_VIRTUAL
unsigned long __phys_addr(unsigned long x, bool is_31bit);
@@ -206,7 +239,7 @@ static inline unsigned long __phys_addr(unsigned long x, bool is_31bit)
#define __pa(x) __phys_addr((unsigned long)(x), false)
#define __pa32(x) __phys_addr((unsigned long)(x), true)
-#define __va(x) ((void *)(unsigned long)(x))
+#define __va(x) ((void *)((unsigned long)(x) + __identity_base))
#endif /* __DECOMPRESSOR */
@@ -231,7 +264,7 @@ static inline unsigned long virt_to_pfn(const void *kaddr)
#define virt_to_page(kaddr) pfn_to_page(virt_to_pfn(kaddr))
#define page_to_virt(page) pfn_to_virt(page_to_pfn(page))
-#define virt_addr_valid(kaddr) pfn_valid(phys_to_pfn(__pa_nodebug(kaddr)))
+#define virt_addr_valid(kaddr) pfn_valid(phys_to_pfn(__pa_nodebug((unsigned long)(kaddr))))
#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_NON_EXEC
@@ -240,4 +273,11 @@ static inline unsigned long virt_to_pfn(const void *kaddr)
#include <asm-generic/memory_model.h>
#include <asm-generic/getorder.h>
+#define AMODE31_SIZE (3 * PAGE_SIZE)
+
+#define KERNEL_IMAGE_SIZE (512 * 1024 * 1024)
+#define __START_KERNEL 0x100000
+#define __NO_KASLR_START_KERNEL CONFIG_KERNEL_IMAGE_BASE
+#define __NO_KASLR_END_KERNEL (__NO_KASLR_START_KERNEL + KERNEL_IMAGE_SIZE)
+
#endif /* _S390_PAGE_H */
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 60950e7a25f5..6f11d063d545 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -107,6 +107,12 @@ static inline int is_module_addr(void *addr)
return 1;
}
+#ifdef CONFIG_RANDOMIZE_BASE
+#define KASLR_LEN (1UL << 31)
+#else
+#define KASLR_LEN 0UL
+#endif
+
/*
* A 64 bit pagetable entry of S390 has following format:
* | PFRA |0IPC| OS |
@@ -566,10 +572,20 @@ static inline pud_t set_pud_bit(pud_t pud, pgprot_t prot)
}
/*
- * In the case that a guest uses storage keys
- * faults should no longer be backed by zero pages
+ * As soon as the guest uses storage keys or enables PV, we deduplicate all
+ * mapped shared zeropages and prevent new shared zeropages from getting
+ * mapped.
*/
-#define mm_forbids_zeropage mm_has_pgste
+#define mm_forbids_zeropage mm_forbids_zeropage
+static inline int mm_forbids_zeropage(struct mm_struct *mm)
+{
+#ifdef CONFIG_PGSTE
+ if (!mm->context.allow_cow_sharing)
+ return 1;
+#endif
+ return 0;
+}
+
static inline int mm_uses_skeys(struct mm_struct *mm)
{
#ifdef CONFIG_PGSTE
diff --git a/arch/s390/include/asm/physmem_info.h b/arch/s390/include/asm/physmem_info.h
index e747b067f8db..f45cfc8bc233 100644
--- a/arch/s390/include/asm/physmem_info.h
+++ b/arch/s390/include/asm/physmem_info.h
@@ -22,7 +22,6 @@ enum reserved_range_type {
RR_DECOMPRESSOR,
RR_INITRD,
RR_VMLINUX,
- RR_RELOC,
RR_AMODE31,
RR_IPLREPORT,
RR_CERT_COMP_LIST,
@@ -170,4 +169,7 @@ static inline unsigned long get_physmem_reserved(enum reserved_range_type type,
return *size;
}
+#define AMODE31_START (physmem_info.reserved[RR_AMODE31].start)
+#define AMODE31_END (physmem_info.reserved[RR_AMODE31].end)
+
#endif
diff --git a/arch/s390/include/asm/setup.h b/arch/s390/include/asm/setup.h
index 03bcaa8effb2..32f70873e2b7 100644
--- a/arch/s390/include/asm/setup.h
+++ b/arch/s390/include/asm/setup.h
@@ -127,20 +127,6 @@ extern void (*_machine_restart)(char *command);
extern void (*_machine_halt)(void);
extern void (*_machine_power_off)(void);
-extern unsigned long __kaslr_offset;
-static inline unsigned long kaslr_offset(void)
-{
- return __kaslr_offset;
-}
-
-extern int __kaslr_enabled;
-static inline int kaslr_enabled(void)
-{
- if (IS_ENABLED(CONFIG_RANDOMIZE_BASE))
- return __kaslr_enabled;
- return 0;
-}
-
struct oldmem_data {
unsigned long start;
unsigned long size;
diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile
index fa029d0dc28f..db2d9ba5a86d 100644
--- a/arch/s390/kernel/Makefile
+++ b/arch/s390/kernel/Makefile
@@ -11,6 +11,8 @@ CFLAGS_REMOVE_ftrace.o = $(CC_FLAGS_FTRACE)
# Do not trace early setup code
CFLAGS_REMOVE_early.o = $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_rethook.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_stacktrace.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_unwind_bc.o = $(CC_FLAGS_FTRACE)
endif
diff --git a/arch/s390/kernel/crash_dump.c b/arch/s390/kernel/crash_dump.c
index d09ebb6f5262..9863ebe75019 100644
--- a/arch/s390/kernel/crash_dump.c
+++ b/arch/s390/kernel/crash_dump.c
@@ -465,7 +465,11 @@ static void *ehdr_init(Elf64_Ehdr *ehdr, int mem_chunk_cnt)
ehdr->e_phoff = sizeof(Elf64_Ehdr);
ehdr->e_ehsize = sizeof(Elf64_Ehdr);
ehdr->e_phentsize = sizeof(Elf64_Phdr);
- ehdr->e_phnum = mem_chunk_cnt + 1;
+ /*
+ * Number of memory chunk PT_LOAD program headers plus one kernel
+ * image PT_LOAD program header plus one PT_NOTE program header.
+ */
+ ehdr->e_phnum = mem_chunk_cnt + 1 + 1;
return ehdr + 1;
}
@@ -501,15 +505,16 @@ static int get_mem_chunk_cnt(void)
*/
static void loads_init(Elf64_Phdr *phdr)
{
+ unsigned long old_identity_base = os_info_old_value(OS_INFO_IDENTITY_BASE);
phys_addr_t start, end;
u64 idx;
for_each_physmem_range(idx, &oldmem_type, &start, &end) {
- phdr->p_filesz = end - start;
phdr->p_type = PT_LOAD;
+ phdr->p_vaddr = old_identity_base + start;
phdr->p_offset = start;
- phdr->p_vaddr = (unsigned long)__va(start);
phdr->p_paddr = start;
+ phdr->p_filesz = end - start;
phdr->p_memsz = end - start;
phdr->p_flags = PF_R | PF_W | PF_X;
phdr->p_align = PAGE_SIZE;
@@ -518,6 +523,25 @@ static void loads_init(Elf64_Phdr *phdr)
}
/*
+ * Prepare PT_LOAD type program header for kernel image region
+ */
+static void text_init(Elf64_Phdr *phdr)
+{
+ unsigned long start_phys = os_info_old_value(OS_INFO_IMAGE_PHYS);
+ unsigned long start = os_info_old_value(OS_INFO_IMAGE_START);
+ unsigned long end = os_info_old_value(OS_INFO_IMAGE_END);
+
+ phdr->p_type = PT_LOAD;
+ phdr->p_vaddr = start;
+ phdr->p_filesz = end - start;
+ phdr->p_memsz = end - start;
+ phdr->p_offset = start_phys;
+ phdr->p_paddr = start_phys;
+ phdr->p_flags = PF_R | PF_W | PF_X;
+ phdr->p_align = PAGE_SIZE;
+}
+
+/*
* Initialize notes (new kernel)
*/
static void *notes_init(Elf64_Phdr *phdr, void *ptr, u64 notes_offset)
@@ -557,6 +581,8 @@ static size_t get_elfcorehdr_size(int mem_chunk_cnt)
size += nt_vmcoreinfo_size();
/* nt_final */
size += sizeof(Elf64_Nhdr);
+ /* PT_LOAD type program header for kernel text region */
+ size += sizeof(Elf64_Phdr);
/* PT_LOADS */
size += mem_chunk_cnt * sizeof(Elf64_Phdr);
@@ -568,7 +594,7 @@ static size_t get_elfcorehdr_size(int mem_chunk_cnt)
*/
int elfcorehdr_alloc(unsigned long long *addr, unsigned long long *size)
{
- Elf64_Phdr *phdr_notes, *phdr_loads;
+ Elf64_Phdr *phdr_notes, *phdr_loads, *phdr_text;
size_t alloc_size;
int mem_chunk_cnt;
void *ptr, *hdr;
@@ -606,14 +632,19 @@ int elfcorehdr_alloc(unsigned long long *addr, unsigned long long *size)
/* Init program headers */
phdr_notes = ptr;
ptr = PTR_ADD(ptr, sizeof(Elf64_Phdr));
+ phdr_text = ptr;
+ ptr = PTR_ADD(ptr, sizeof(Elf64_Phdr));
phdr_loads = ptr;
ptr = PTR_ADD(ptr, sizeof(Elf64_Phdr) * mem_chunk_cnt);
/* Init notes */
hdr_off = PTR_DIFF(ptr, hdr);
ptr = notes_init(phdr_notes, ptr, ((unsigned long) hdr) + hdr_off);
+ /* Init kernel text program header */
+ text_init(phdr_text);
/* Init loads */
- hdr_off = PTR_DIFF(ptr, hdr);
loads_init(phdr_loads);
+ /* Finalize program headers */
+ hdr_off = PTR_DIFF(ptr, hdr);
*addr = (unsigned long long) hdr;
*size = (unsigned long long) hdr_off;
BUG_ON(elfcorehdr_size > alloc_size);
diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c
index 1486350a4177..7dc54571f18e 100644
--- a/arch/s390/kernel/ipl.c
+++ b/arch/s390/kernel/ipl.c
@@ -1209,8 +1209,8 @@ static struct attribute_group reipl_nss_attr_group = {
void set_os_info_reipl_block(void)
{
- os_info_entry_add(OS_INFO_REIPL_BLOCK, reipl_block_actual,
- reipl_block_actual->hdr.len);
+ os_info_entry_add_data(OS_INFO_REIPL_BLOCK, reipl_block_actual,
+ reipl_block_actual->hdr.len);
}
/* reipl type */
@@ -1940,7 +1940,7 @@ static void dump_reipl_run(struct shutdown_trigger *trigger)
reipl_type == IPL_TYPE_NSS ||
reipl_type == IPL_TYPE_UNKNOWN)
os_info_flags |= OS_INFO_FLAG_REIPL_CLEAR;
- os_info_entry_add(OS_INFO_FLAGS_ENTRY, &os_info_flags, sizeof(os_info_flags));
+ os_info_entry_add_data(OS_INFO_FLAGS_ENTRY, &os_info_flags, sizeof(os_info_flags));
csum = (__force unsigned int)cksm(reipl_block_actual, reipl_block_actual->hdr.len, 0);
abs_lc = get_abs_lowcore();
abs_lc->ipib = __pa(reipl_block_actual);
diff --git a/arch/s390/kernel/nospec-branch.c b/arch/s390/kernel/nospec-branch.c
index d1b16d83e49a..9b8c24ebb008 100644
--- a/arch/s390/kernel/nospec-branch.c
+++ b/arch/s390/kernel/nospec-branch.c
@@ -114,10 +114,10 @@ static void __init_or_module __nospec_revert(s32 *start, s32 *end)
type = BRASL_EXPOLINE; /* brasl instruction */
else
continue;
- thunk = instr + (*(int *)(instr + 2)) * 2;
+ thunk = instr + (long)(*(int *)(instr + 2)) * 2;
if (thunk[0] == 0xc6 && thunk[1] == 0x00)
/* exrl %r0,<target-br> */
- br = thunk + (*(int *)(thunk + 2)) * 2;
+ br = thunk + (long)(*(int *)(thunk + 2)) * 2;
else
continue;
if (br[0] != 0x07 || (br[1] & 0xf0) != 0xf0)
diff --git a/arch/s390/kernel/os_info.c b/arch/s390/kernel/os_info.c
index a801e6bd5341..b695f980bbde 100644
--- a/arch/s390/kernel/os_info.c
+++ b/arch/s390/kernel/os_info.c
@@ -15,8 +15,10 @@
#include <asm/checksum.h>
#include <asm/abs_lowcore.h>
#include <asm/os_info.h>
+#include <asm/physmem_info.h>
#include <asm/maccess.h>
#include <asm/asm-offsets.h>
+#include <asm/ipl.h>
/*
* OS info structure has to be page aligned
@@ -43,9 +45,9 @@ void os_info_crashkernel_add(unsigned long base, unsigned long size)
}
/*
- * Add OS info entry and update checksum
+ * Add OS info data entry and update checksum
*/
-void os_info_entry_add(int nr, void *ptr, u64 size)
+void os_info_entry_add_data(int nr, void *ptr, u64 size)
{
os_info.entry[nr].addr = __pa(ptr);
os_info.entry[nr].size = size;
@@ -54,15 +56,36 @@ void os_info_entry_add(int nr, void *ptr, u64 size)
}
/*
+ * Add OS info value entry and update checksum
+ */
+void os_info_entry_add_val(int nr, u64 value)
+{
+ os_info.entry[nr].val = value;
+ os_info.entry[nr].size = 0;
+ os_info.entry[nr].csum = 0;
+ os_info.csum = os_info_csum(&os_info);
+}
+
+/*
* Initialize OS info structure and set lowcore pointer
*/
void __init os_info_init(void)
{
struct lowcore *abs_lc;
+ BUILD_BUG_ON(sizeof(struct os_info) != PAGE_SIZE);
os_info.version_major = OS_INFO_VERSION_MAJOR;
os_info.version_minor = OS_INFO_VERSION_MINOR;
os_info.magic = OS_INFO_MAGIC;
+ os_info_entry_add_val(OS_INFO_IDENTITY_BASE, __identity_base);
+ os_info_entry_add_val(OS_INFO_KASLR_OFFSET, kaslr_offset());
+ os_info_entry_add_val(OS_INFO_KASLR_OFF_PHYS, __kaslr_offset_phys);
+ os_info_entry_add_val(OS_INFO_VMEMMAP, (unsigned long)vmemmap);
+ os_info_entry_add_val(OS_INFO_AMODE31_START, AMODE31_START);
+ os_info_entry_add_val(OS_INFO_AMODE31_END, AMODE31_END);
+ os_info_entry_add_val(OS_INFO_IMAGE_START, (unsigned long)_stext);
+ os_info_entry_add_val(OS_INFO_IMAGE_END, (unsigned long)_end);
+ os_info_entry_add_val(OS_INFO_IMAGE_PHYS, __pa_symbol(_stext));
os_info.csum = os_info_csum(&os_info);
abs_lc = get_abs_lowcore();
abs_lc->os_info = __pa(&os_info);
@@ -125,7 +148,7 @@ static void os_info_old_init(void)
if (os_info_init)
return;
- if (!oldmem_data.start)
+ if (!oldmem_data.start && !is_ipl_type_dump())
goto fail;
if (copy_oldmem_kernel(&addr, __LC_OS_INFO, sizeof(addr)))
goto fail;
diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c
index 41ed6e0f0a2a..1434642e9cba 100644
--- a/arch/s390/kernel/perf_cpum_cf.c
+++ b/arch/s390/kernel/perf_cpum_cf.c
@@ -428,7 +428,7 @@ static void cpum_cf_make_setsize(enum cpumf_ctr_set ctrset)
case CPUMF_CTR_SET_CRYPTO:
if (cpumf_ctr_info.csvn >= 1 && cpumf_ctr_info.csvn <= 5)
ctrset_size = 16;
- else if (cpumf_ctr_info.csvn == 6 || cpumf_ctr_info.csvn == 7)
+ else if (cpumf_ctr_info.csvn >= 6)
ctrset_size = 20;
break;
case CPUMF_CTR_SET_EXT:
diff --git a/arch/s390/kernel/perf_cpum_cf_events.c b/arch/s390/kernel/perf_cpum_cf_events.c
index 0d64aafd158f..e4a6bfc91080 100644
--- a/arch/s390/kernel/perf_cpum_cf_events.c
+++ b/arch/s390/kernel/perf_cpum_cf_events.c
@@ -855,16 +855,11 @@ __init const struct attribute_group **cpumf_cf_event_group(void)
}
/* Determine version specific crypto set */
- switch (ci.csvn) {
- case 1 ... 5:
+ csvn = none;
+ if (ci.csvn >= 1 && ci.csvn <= 5)
csvn = cpumcf_svn_12345_pmu_event_attr;
- break;
- case 6 ... 7:
+ else if (ci.csvn >= 6)
csvn = cpumcf_svn_67_pmu_event_attr;
- break;
- default:
- csvn = none;
- }
/* Determine model-specific counter set(s) */
get_cpu_id(&cpu_id);
diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index 24ed33f044ec..cbd5290939df 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -146,10 +146,10 @@ static u32 __amode31_ref *__ctl_linkage_stack = __ctl_linkage_stack_amode31;
static u32 __amode31_ref *__ctl_duct = __ctl_duct_amode31;
unsigned long __bootdata_preserved(max_mappable);
-unsigned long __bootdata(ident_map_size);
struct physmem_info __bootdata(physmem_info);
-unsigned long __bootdata_preserved(__kaslr_offset);
+struct vm_layout __bootdata_preserved(vm_layout);
+EXPORT_SYMBOL_GPL(vm_layout);
int __bootdata_preserved(__kaslr_enabled);
unsigned int __bootdata_preserved(zlib_dfltcc_support);
EXPORT_SYMBOL(zlib_dfltcc_support);
@@ -765,7 +765,7 @@ static void __init relocate_amode31_section(void)
unsigned long amode31_size = __eamode31 - __samode31;
long amode31_offset, *ptr;
- amode31_offset = physmem_info.reserved[RR_AMODE31].start - (unsigned long)__samode31;
+ amode31_offset = AMODE31_START - (unsigned long)__samode31;
pr_info("Relocating AMODE31 section of size 0x%08lx\n", amode31_size);
/* Move original AMODE31 section to the new one */
diff --git a/arch/s390/kernel/stacktrace.c b/arch/s390/kernel/stacktrace.c
index 94f440e38303..7c294da45bf5 100644
--- a/arch/s390/kernel/stacktrace.c
+++ b/arch/s390/kernel/stacktrace.c
@@ -101,3 +101,22 @@ void arch_stack_walk_user(stack_trace_consume_fn consume_entry, void *cookie,
}
pagefault_enable();
}
+
+unsigned long return_address(unsigned int n)
+{
+ struct unwind_state state;
+ unsigned long addr;
+
+ /* Increment to skip current stack entry */
+ n++;
+
+ unwind_for_each_frame(&state, NULL, NULL, 0) {
+ addr = unwind_get_return_address(&state);
+ if (!addr)
+ break;
+ if (!n--)
+ return addr;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(return_address);
diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c
index fc07bc39e698..265fea37e030 100644
--- a/arch/s390/kernel/uv.c
+++ b/arch/s390/kernel/uv.c
@@ -21,6 +21,7 @@
/* the bootdata_preserved fields come from ones in arch/s390/boot/uv.c */
#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST
int __bootdata_preserved(prot_virt_guest);
+EXPORT_SYMBOL(prot_virt_guest);
#endif
/*
@@ -181,36 +182,36 @@ int uv_convert_owned_from_secure(unsigned long paddr)
}
/*
- * Calculate the expected ref_count for a page that would otherwise have no
+ * Calculate the expected ref_count for a folio that would otherwise have no
* further pins. This was cribbed from similar functions in other places in
* the kernel, but with some slight modifications. We know that a secure
- * page can not be a huge page for example.
+ * folio can not be a large folio, for example.
*/
-static int expected_page_refs(struct page *page)
+static int expected_folio_refs(struct folio *folio)
{
int res;
- res = page_mapcount(page);
- if (PageSwapCache(page)) {
+ res = folio_mapcount(folio);
+ if (folio_test_swapcache(folio)) {
res++;
- } else if (page_mapping(page)) {
+ } else if (folio_mapping(folio)) {
res++;
- if (page_has_private(page))
+ if (folio->private)
res++;
}
return res;
}
-static int make_page_secure(struct page *page, struct uv_cb_header *uvcb)
+static int make_folio_secure(struct folio *folio, struct uv_cb_header *uvcb)
{
int expected, cc = 0;
- if (PageWriteback(page))
+ if (folio_test_writeback(folio))
return -EAGAIN;
- expected = expected_page_refs(page);
- if (!page_ref_freeze(page, expected))
+ expected = expected_folio_refs(folio);
+ if (!folio_ref_freeze(folio, expected))
return -EBUSY;
- set_bit(PG_arch_1, &page->flags);
+ set_bit(PG_arch_1, &folio->flags);
/*
* If the UVC does not succeed or fail immediately, we don't want to
* loop for long, or we might get stall notifications.
@@ -220,9 +221,9 @@ static int make_page_secure(struct page *page, struct uv_cb_header *uvcb)
* -EAGAIN and we let the callers deal with it.
*/
cc = __uv_call(0, (u64)uvcb);
- page_ref_unfreeze(page, expected);
+ folio_ref_unfreeze(folio, expected);
/*
- * Return -ENXIO if the page was not mapped, -EINVAL for other errors.
+ * Return -ENXIO if the folio was not mapped, -EINVAL for other errors.
* If busy or partially completed, return -EAGAIN.
*/
if (cc == UVC_CC_OK)
@@ -277,7 +278,7 @@ int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb)
bool local_drain = false;
spinlock_t *ptelock;
unsigned long uaddr;
- struct page *page;
+ struct folio *folio;
pte_t *ptep;
int rc;
@@ -306,15 +307,19 @@ again:
if (!ptep)
goto out;
if (pte_present(*ptep) && !(pte_val(*ptep) & _PAGE_INVALID) && pte_write(*ptep)) {
- page = pte_page(*ptep);
+ folio = page_folio(pte_page(*ptep));
+ rc = -EINVAL;
+ if (folio_test_large(folio))
+ goto unlock;
rc = -EAGAIN;
- if (trylock_page(page)) {
+ if (folio_trylock(folio)) {
if (should_export_before_import(uvcb, gmap->mm))
- uv_convert_from_secure(page_to_phys(page));
- rc = make_page_secure(page, uvcb);
- unlock_page(page);
+ uv_convert_from_secure(PFN_PHYS(folio_pfn(folio)));
+ rc = make_folio_secure(folio, uvcb);
+ folio_unlock(folio);
}
}
+unlock:
pte_unmap_unlock(ptep, ptelock);
out:
mmap_read_unlock(gmap->mm);
@@ -324,10 +329,10 @@ out:
* If we are here because the UVC returned busy or partial
* completion, this is just a useless check, but it is safe.
*/
- wait_on_page_writeback(page);
+ folio_wait_writeback(folio);
} else if (rc == -EBUSY) {
/*
- * If we have tried a local drain and the page refcount
+ * If we have tried a local drain and the folio refcount
* still does not match our expected safe value, try with a
* system wide drain. This is needed if the pagevecs holding
* the page are on a different CPU.
@@ -338,7 +343,7 @@ out:
return -EAGAIN;
}
/*
- * We are here if the page refcount does not match the
+ * We are here if the folio refcount does not match the
* expected safe value. The main culprits are usually
* pagevecs. With lru_add_drain() we drain the pagevecs
* on the local CPU so that hopefully the refcount will
diff --git a/arch/s390/kernel/vmcore_info.c b/arch/s390/kernel/vmcore_info.c
index d296dfc22191..23f7d7619a99 100644
--- a/arch/s390/kernel/vmcore_info.c
+++ b/arch/s390/kernel/vmcore_info.c
@@ -14,7 +14,9 @@ void arch_crash_save_vmcoreinfo(void)
VMCOREINFO_LENGTH(lowcore_ptr, NR_CPUS);
vmcoreinfo_append_str("SAMODE31=%lx\n", (unsigned long)__samode31);
vmcoreinfo_append_str("EAMODE31=%lx\n", (unsigned long)__eamode31);
+ vmcoreinfo_append_str("IDENTITYBASE=%lx\n", __identity_base);
vmcoreinfo_append_str("KERNELOFFSET=%lx\n", kaslr_offset());
+ vmcoreinfo_append_str("KERNELOFFPHYS=%lx\n", __kaslr_offset_phys);
abs_lc = get_abs_lowcore();
abs_lc->vmcore_info = paddr_vmcoreinfo_note();
put_abs_lowcore(abs_lc);
diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S
index 48de296e8905..a1ce3925ec71 100644
--- a/arch/s390/kernel/vmlinux.lds.S
+++ b/arch/s390/kernel/vmlinux.lds.S
@@ -39,7 +39,7 @@ PHDRS {
SECTIONS
{
- . = 0x100000;
+ . = __START_KERNEL;
.text : {
_stext = .; /* Start of text section */
_text = .; /* Text and read-only data */
@@ -183,7 +183,7 @@ SECTIONS
.amode31.data : {
*(.amode31.data)
}
- . = ALIGN(PAGE_SIZE);
+ . = _samode31 + AMODE31_SIZE;
_eamode31 = .;
/* early.c uses stsi, which requires page aligned data. */
@@ -192,31 +192,6 @@ SECTIONS
PERCPU_SECTION(0x100)
-#ifdef CONFIG_PIE_BUILD
- .dynsym ALIGN(8) : {
- __dynsym_start = .;
- *(.dynsym)
- __dynsym_end = .;
- }
- .rela.dyn ALIGN(8) : {
- __rela_dyn_start = .;
- *(.rela*)
- __rela_dyn_end = .;
- }
- .dynamic ALIGN(8) : {
- *(.dynamic)
- }
- .dynstr ALIGN(8) : {
- *(.dynstr)
- }
-#endif
- .hash ALIGN(8) : {
- *(.hash)
- }
- .gnu.hash ALIGN(8) : {
- *(.gnu.hash)
- }
-
. = ALIGN(PAGE_SIZE);
__init_end = .; /* freed after init ends here */
@@ -230,7 +205,6 @@ SECTIONS
* it should match struct vmlinux_info
*/
.vmlinux.info 0 (INFO) : {
- QUAD(_stext) /* default_lma */
QUAD(startup_continue) /* entry */
QUAD(__bss_start - _stext) /* image_size */
QUAD(__bss_stop - __bss_start) /* bss_size */
@@ -239,14 +213,8 @@ SECTIONS
QUAD(__boot_data_preserved_start) /* bootdata_preserved_off */
QUAD(__boot_data_preserved_end -
__boot_data_preserved_start) /* bootdata_preserved_size */
-#ifdef CONFIG_PIE_BUILD
- QUAD(__dynsym_start) /* dynsym_start */
- QUAD(__rela_dyn_start) /* rela_dyn_start */
- QUAD(__rela_dyn_end) /* rela_dyn_end */
-#else
QUAD(__got_start) /* got_start */
QUAD(__got_end) /* got_end */
-#endif
QUAD(_eamode31 - _samode31) /* amode31_size */
QUAD(init_mm)
QUAD(swapper_pg_dir)
@@ -282,12 +250,10 @@ SECTIONS
*(.plt) *(.plt.*) *(.iplt) *(.igot .igot.plt)
}
ASSERT(SIZEOF(.plt) == 0, "Unexpected run-time procedure linkages detected!")
-#ifndef CONFIG_PIE_BUILD
.rela.dyn : {
*(.rela.*) *(.rela_*)
}
ASSERT(SIZEOF(.rela.dyn) == 0, "Unexpected run-time relocations (.rela) detected!")
-#endif
/* Sections to be discarded */
DISCARDS
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 7721eb522f43..82e9631cd9ef 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -2631,9 +2631,7 @@ static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd)
if (r)
break;
- mmap_write_lock(current->mm);
- r = gmap_mark_unmergeable();
- mmap_write_unlock(current->mm);
+ r = s390_disable_cow_sharing();
if (r)
break;
diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index b2c9f010f0fe..c9ecae830634 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -12,6 +12,7 @@
#include <linux/list.h>
#include <linux/bitmap.h>
#include <linux/sched/signal.h>
+#include <linux/io.h>
#include <asm/gmap.h>
#include <asm/mmu_context.h>
@@ -361,7 +362,7 @@ end:
case -EACCES:
return set_validity_icpt(scb_s, 0x003CU);
}
- scb_s->crycbd = ((__u32)(__u64) &vsie_page->crycb) | CRYCB_FORMAT2;
+ scb_s->crycbd = (u32)virt_to_phys(&vsie_page->crycb) | CRYCB_FORMAT2;
return 0;
}
@@ -1005,7 +1006,7 @@ static int handle_stfle(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
if (read_guest_real(vcpu, fac, &vsie_page->fac,
stfle_size() * sizeof(u64)))
return set_validity_icpt(scb_s, 0x1090U);
- scb_s->fac = (__u32)(__u64) &vsie_page->fac;
+ scb_s->fac = (u32)virt_to_phys(&vsie_page->fac);
}
return 0;
}
diff --git a/arch/s390/lib/Makefile b/arch/s390/lib/Makefile
index 90eac15ea62a..f43f897d3fc0 100644
--- a/arch/s390/lib/Makefile
+++ b/arch/s390/lib/Makefile
@@ -23,4 +23,4 @@ obj-$(CONFIG_S390_MODULES_SANITY_TEST_HELPERS) += test_modules_helpers.o
lib-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
-obj-$(CONFIG_EXPOLINE_EXTERN) += expoline/
+obj-$(CONFIG_EXPOLINE_EXTERN) += expoline.o
diff --git a/arch/s390/lib/expoline/expoline.S b/arch/s390/lib/expoline.S
index 92ed8409a7a4..92ed8409a7a4 100644
--- a/arch/s390/lib/expoline/expoline.S
+++ b/arch/s390/lib/expoline.S
diff --git a/arch/s390/lib/expoline/Makefile b/arch/s390/lib/expoline/Makefile
deleted file mode 100644
index 854631d9cb03..000000000000
--- a/arch/s390/lib/expoline/Makefile
+++ /dev/null
@@ -1,3 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-
-obj-y += expoline.o
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index 12d22a7fa32f..474a25ca5c48 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -2550,41 +2550,6 @@ static inline void thp_split_mm(struct mm_struct *mm)
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
/*
- * Remove all empty zero pages from the mapping for lazy refaulting
- * - This must be called after mm->context.has_pgste is set, to avoid
- * future creation of zero pages
- * - This must be called after THP was disabled.
- *
- * mm contracts with s390, that even if mm were to remove a page table,
- * racing with the loop below and so causing pte_offset_map_lock() to fail,
- * it will never insert a page table containing empty zero pages once
- * mm_forbids_zeropage(mm) i.e. mm->context.has_pgste is set.
- */
-static int __zap_zero_pages(pmd_t *pmd, unsigned long start,
- unsigned long end, struct mm_walk *walk)
-{
- unsigned long addr;
-
- for (addr = start; addr != end; addr += PAGE_SIZE) {
- pte_t *ptep;
- spinlock_t *ptl;
-
- ptep = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
- if (!ptep)
- break;
- if (is_zero_pfn(pte_pfn(*ptep)))
- ptep_xchg_direct(walk->mm, addr, ptep, __pte(_PAGE_INVALID));
- pte_unmap_unlock(ptep, ptl);
- }
- return 0;
-}
-
-static const struct mm_walk_ops zap_zero_walk_ops = {
- .pmd_entry = __zap_zero_pages,
- .walk_lock = PGWALK_WRLOCK,
-};
-
-/*
* switch on pgstes for its userspace process (for kvm)
*/
int s390_enable_sie(void)
@@ -2601,22 +2566,142 @@ int s390_enable_sie(void)
mm->context.has_pgste = 1;
/* split thp mappings and disable thp for future mappings */
thp_split_mm(mm);
- walk_page_range(mm, 0, TASK_SIZE, &zap_zero_walk_ops, NULL);
mmap_write_unlock(mm);
return 0;
}
EXPORT_SYMBOL_GPL(s390_enable_sie);
-int gmap_mark_unmergeable(void)
+static int find_zeropage_pte_entry(pte_t *pte, unsigned long addr,
+ unsigned long end, struct mm_walk *walk)
+{
+ unsigned long *found_addr = walk->private;
+
+ /*
+ * Return 1 if the page is a zeropage, after storing its address
+ * through walk->private. Return 0 for any other entry and -EFAULT
+ * for a zeropage found outside a COW mapping.
+ */
+ if (is_zero_pfn(pte_pfn(*pte))) {
+ /*
+ * Shared zeropage in e.g., a FS DAX mapping? We cannot do the
+ * right thing and likely don't care: FAULT_FLAG_UNSHARE
+ * currently only works in COW mappings, which is also where
+ * mm_forbids_zeropage() is checked.
+ */
+ if (!is_cow_mapping(walk->vma->vm_flags))
+ return -EFAULT;
+
+ *found_addr = addr;
+ return 1;
+ }
+ return 0;
+}
+
+static const struct mm_walk_ops find_zeropage_ops = {
+ .pte_entry = find_zeropage_pte_entry, /* per-PTE check for shared zeropages */
+ .walk_lock = PGWALK_WRLOCK,
+};
+
+/*
+ * Unshare all shared zeropages, replacing them by anonymous pages. Note that
+ * we cannot simply zap all shared zeropages, because this could later
+ * trigger unexpected userfaultfd missing events.
+ *
+ * This must be called after mm->context.allow_cow_sharing was
+ * set to 0, to avoid future mappings of shared zeropages.
+ *
+ * mm contracts with s390 that, even if mm were to remove a page table
+ * while racing with walk_page_range_vma() so that pte_offset_map_lock()
+ * fails, it will never insert a page table containing empty zero
+ * pages once mm_forbids_zeropage(mm) holds, i.e. once
+ * mm->context.allow_cow_sharing is set to 0.
+ *
+ * Returns 0 once no walk finds a zeropage anymore, a negative value
+ * returned by the page walk, or -ENOMEM if handle_mm_fault() reports
+ * VM_FAULT_OOM.
+ */
+static int __s390_unshare_zeropages(struct mm_struct *mm)
+{
+ struct vm_area_struct *vma;
+ VMA_ITERATOR(vmi, mm, 0);
+ unsigned long addr;
+ vm_fault_t fault;
+ int rc;
+
+ for_each_vma(vmi, vma) {
+ /*
+ * We could only look at COW mappings, but it's more future
+ * proof to catch unexpected zeropages in other mappings and
+ * fail.
+ */
+ if ((vma->vm_flags & VM_PFNMAP) || is_vm_hugetlb_page(vma))
+ continue;
+ addr = vma->vm_start;
+
+retry:
+ rc = walk_page_range_vma(vma, addr, vma->vm_end,
+ &find_zeropage_ops, &addr);
+ if (rc < 0)
+ return rc;
+ else if (!rc)
+ continue;
+
+ /* addr was updated by find_zeropage_pte_entry() */
+ fault = handle_mm_fault(vma, addr,
+ FAULT_FLAG_UNSHARE | FAULT_FLAG_REMOTE,
+ NULL);
+ if (fault & VM_FAULT_OOM)
+ return -ENOMEM;
+ /*
+ * See break_ksm(): even after handle_mm_fault() returned 0, we
+ * must start the lookup from the current address, because
+ * handle_mm_fault() may back out if there's any difficulty.
+ *
+ * VM_FAULT_SIGBUS and VM_FAULT_SIGSEGV are unexpected but
+ * maybe they could trigger in the future on concurrent
+ * truncation. In that case, the shared zeropage would be gone
+ * and we can simply retry and make progress.
+ */
+ cond_resched();
+ goto retry;
+ }
+
+ return 0;
+}
+
+static int __s390_disable_cow_sharing(struct mm_struct *mm)
{
+ int rc;
+
+ if (!mm->context.allow_cow_sharing)
+ return 0;
+
+ mm->context.allow_cow_sharing = 0;
+
+ /* Replace all shared zeropages by anonymous pages. */
+ rc = __s390_unshare_zeropages(mm);
/*
* Make sure to disable KSM (if enabled for the whole process or
* individual VMAs). Note that nothing currently hinders user space
* from re-enabling it.
*/
- return ksm_disable(current->mm);
+ if (!rc)
+ rc = ksm_disable(mm);
+ if (rc)
+ mm->context.allow_cow_sharing = 1;
+ return rc;
+}
+
+/*
+ * Disable most COW-sharing of memory pages for the whole process:
+ * (1) Disable KSM and unmerge/unshare any KSM pages.
+ * (2) Disallow shared zeropages and unshare any zeropages that are mapped.
+ *
+ * Note that we currently don't bother with COW-shared pages that are shared
+ * with parent/child processes due to fork().
+ *
+ * Operates on current->mm with its mmap lock held for writing and returns
+ * the result of __s390_disable_cow_sharing().
+ */
+int s390_disable_cow_sharing(void)
+{
+ int rc;
+
+ mmap_write_lock(current->mm);
+ rc = __s390_disable_cow_sharing(current->mm);
+ mmap_write_unlock(current->mm);
+ return rc;
+}
-EXPORT_SYMBOL_GPL(gmap_mark_unmergeable);
+EXPORT_SYMBOL_GPL(s390_disable_cow_sharing);
/*
* Enable storage key handling from now on and initialize the storage
@@ -2685,7 +2770,7 @@ int s390_enable_skey(void)
goto out_up;
mm->context.uses_skeys = 1;
- rc = gmap_mark_unmergeable();
+ rc = __s390_disable_cow_sharing(mm);
if (rc) {
mm->context.uses_skeys = 0;
goto out_up;
diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c
index 85cddf904cb2..41c714e21292 100644
--- a/arch/s390/mm/vmem.c
+++ b/arch/s390/mm/vmem.c
@@ -13,7 +13,9 @@
#include <linux/slab.h>
#include <linux/sort.h>
#include <asm/page-states.h>
+#include <asm/abs_lowcore.h>
#include <asm/cacheflush.h>
+#include <asm/maccess.h>
#include <asm/nospec-branch.h>
#include <asm/ctlreg.h>
#include <asm/pgalloc.h>
@@ -21,6 +23,7 @@
#include <asm/tlbflush.h>
#include <asm/sections.h>
#include <asm/set_memory.h>
+#include <asm/physmem_info.h>
static DEFINE_MUTEX(vmem_mutex);
@@ -436,7 +439,7 @@ static int modify_pagetable(unsigned long start, unsigned long end, bool add,
if (WARN_ON_ONCE(!PAGE_ALIGNED(start | end)))
return -EINVAL;
/* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */
- if (WARN_ON_ONCE(end > VMALLOC_START))
+ if (WARN_ON_ONCE(end > __abs_lowcore))
return -EINVAL;
for (addr = start; addr < end; addr = next) {
next = pgd_addr_end(addr, end);
diff --git a/arch/s390/pci/pci_sysfs.c b/arch/s390/pci/pci_sysfs.c
index a0b872b74fe3..0f4f1e8fc480 100644
--- a/arch/s390/pci/pci_sysfs.c
+++ b/arch/s390/pci/pci_sysfs.c
@@ -172,7 +172,6 @@ static ssize_t uid_is_unique_show(struct device *dev,
}
static DEVICE_ATTR_RO(uid_is_unique);
-#ifndef CONFIG_DMI
/* analogous to smbios index */
static ssize_t index_show(struct device *dev,
struct device_attribute *attr, char *buf)
@@ -202,7 +201,6 @@ static struct attribute_group zpci_ident_attr_group = {
.attrs = zpci_ident_attrs,
.is_visible = zpci_index_is_visible,
};
-#endif
static struct bin_attribute *zpci_bin_attrs[] = {
&bin_attr_util_string,
@@ -245,8 +243,6 @@ static struct attribute_group pfip_attr_group = {
const struct attribute_group *zpci_attr_groups[] = {
&zpci_attr_group,
&pfip_attr_group,
-#ifndef CONFIG_DMI
&zpci_ident_attr_group,
-#endif
NULL,
};
diff --git a/arch/s390/tools/relocs.c b/arch/s390/tools/relocs.c
index 30a732c808f3..a74dbd5c9896 100644
--- a/arch/s390/tools/relocs.c
+++ b/arch/s390/tools/relocs.c
@@ -280,7 +280,7 @@ static int do_reloc(struct section *sec, Elf_Rel *rel)
case R_390_GOTOFF64:
break;
case R_390_64:
- add_reloc(&relocs64, offset);
+ add_reloc(&relocs64, offset - ehdr.e_entry);
break;
default:
die("Unsupported relocation type: %d\n", r_type);