From 8cf57d7217c32133d25615324c0ab4aaacf4d9c4 Mon Sep 17 00:00:00 2001 From: Anastasia Eskova Date: Fri, 28 Apr 2023 06:42:42 -0400 Subject: s390: add support for user-defined certificates Enable receiving the user-defined certificates from the s390x hypervisor via new diagnose 0x320 calls, and make them available to the Linux root user as 'cert_store_key' type keys in a so-called 'cert_store' keyring. New user-space interfaces: /sys/firmware/cert_store/refresh Writing to this attribute re-fetches certificates via DIAG 0x320 /sys/firmware/cert_store/cs_status Reading from this attribute returns either of: "uninitialized" If no certificate has been retrieved yet "ok" If certificates have been successfully retrieved "failed (<number>)" If certificate retrieval failed with reason code <number> New debug trace areas: /sys/kernel/debug/s390dbf/cert_store_msg /sys/kernel/debug/s390dbf/cert_store_hexdump Usage example: To initiate request for certificates available to the system as root: $ echo 1 > /sys/firmware/cert_store/refresh Upon success the '/sys/firmware/cert_store/cs_status' contains the value 'ok'. $ cat /sys/firmware/cert_store/cs_status ok Get the ID of the keyring 'cert_store': $ keyctl search @us keyring cert_store OR $ keyctl link @us @s; keyctl request keyring cert_store Obtain list of IDs of certificates: $ keyctl rlist <cert_store keyring ID> Display certificate content as hex-dump: $ keyctl read <certificate ID> Read certificate contents as binary data: $ keyctl pipe <certificate ID> >cert_data Display certificate description: $ keyctl describe <certificate ID> The certificate description has the following format: <64 bytes certificate name in EBCDIC> ':' <certificate index> ':' <certificate store token> The certificate description in /proc/keys has certificate name represented in ASCII. Users can read but cannot update the content of the certificate. 
Signed-off-by: Anastasia Eskova Reviewed-by: Peter Oberparleiter Acked-by: Heiko Carstens Signed-off-by: Heiko Carstens --- arch/s390/Kconfig | 10 + arch/s390/include/asm/diag.h | 1 + arch/s390/include/asm/sclp.h | 1 + arch/s390/kernel/Makefile | 2 +- arch/s390/kernel/cert_store.c | 810 +++++++++++++++++++++++++++++++++++++++++ arch/s390/kernel/diag.c | 1 + drivers/s390/char/sclp_early.c | 1 + 7 files changed, 825 insertions(+), 1 deletion(-) create mode 100644 arch/s390/kernel/cert_store.c diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 5b39918b7042..c0afca69904e 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -512,6 +512,16 @@ config KEXEC_SIG verification for the corresponding kernel image type being loaded in order for this to work. +config CERT_STORE + bool "Get user certificates via DIAG320" + depends on KEYS + help + Enable this option if you want to access user-provided secure boot + certificates via DIAG 0x320. + + These certificates will be made available via the keyring named + 'cert_store'. 
+ config KERNEL_NOBP def_bool n prompt "Enable modified branch prediction for the kernel by default" diff --git a/arch/s390/include/asm/diag.h b/arch/s390/include/asm/diag.h index 902e0330dd91..fb5a886ff47f 100644 --- a/arch/s390/include/asm/diag.h +++ b/arch/s390/include/asm/diag.h @@ -36,6 +36,7 @@ enum diag_stat_enum { DIAG_STAT_X304, DIAG_STAT_X308, DIAG_STAT_X318, + DIAG_STAT_X320, DIAG_STAT_X500, NR_DIAG_STAT }; diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h index dac7da88f61f..5742d23bba13 100644 --- a/arch/s390/include/asm/sclp.h +++ b/arch/s390/include/asm/sclp.h @@ -86,6 +86,7 @@ struct sclp_info { unsigned char has_kss : 1; unsigned char has_gisaf : 1; unsigned char has_diag318 : 1; + unsigned char has_diag320 : 1; unsigned char has_sipl : 1; unsigned char has_sipl_eckd : 1; unsigned char has_dirq : 1; diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile index 6b2a051e1f8a..8d7514c72bb8 100644 --- a/arch/s390/kernel/Makefile +++ b/arch/s390/kernel/Makefile @@ -68,7 +68,7 @@ obj-$(CONFIG_JUMP_LABEL) += jump_label.o obj-$(CONFIG_KEXEC_FILE) += machine_kexec_file.o kexec_image.o obj-$(CONFIG_KEXEC_FILE) += kexec_elf.o - +obj-$(CONFIG_CERT_STORE) += cert_store.o obj-$(CONFIG_IMA_SECURE_AND_OR_TRUSTED_BOOT) += ima_arch.o obj-$(CONFIG_PERF_EVENTS) += perf_event.o diff --git a/arch/s390/kernel/cert_store.c b/arch/s390/kernel/cert_store.c new file mode 100644 index 000000000000..1cbeb9ce0eb1 --- /dev/null +++ b/arch/s390/kernel/cert_store.c @@ -0,0 +1,810 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * DIAG 0x320 support and certificate store handling + * + * Copyright IBM Corp. 
2023 + * Author(s): Anastasia Eskova + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DIAG_MAX_RETRIES 10 + +#define VCE_FLAGS_VALID_MASK 0x80 + +#define ISM_LEN_DWORDS 4 +#define VCSSB_LEN_BYTES 128 +#define VCSSB_LEN_NO_CERTS 4 +#define VCB_LEN_NO_CERTS 64 +#define VC_NAME_LEN_BYTES 64 + +#define CERT_STORE_KEY_TYPE_NAME "cert_store_key" +#define CERT_STORE_KEYRING_NAME "cert_store" + +static debug_info_t *cert_store_dbf; +static debug_info_t *cert_store_hexdump; + +#define pr_dbf_msg(fmt, ...) \ + debug_sprintf_event(cert_store_dbf, 3, fmt "\n", ## __VA_ARGS__) + +enum diag320_subcode { + DIAG320_SUBCODES = 0, + DIAG320_STORAGE = 1, + DIAG320_CERT_BLOCK = 2, +}; + +enum diag320_rc { + DIAG320_RC_OK = 0x0001, + DIAG320_RC_CS_NOMATCH = 0x0306, +}; + +/* Verification Certificates Store Support Block (VCSSB). */ +struct vcssb { + u32 vcssb_length; + u8 pad_0x04[3]; + u8 version; + u8 pad_0x08[8]; + u32 cs_token; + u8 pad_0x14[12]; + u16 total_vc_index_count; + u16 max_vc_index_count; + u8 pad_0x24[28]; + u32 max_vce_length; + u32 max_vcxe_length; + u8 pad_0x48[8]; + u32 max_single_vcb_length; + u32 total_vcb_length; + u32 max_single_vcxb_length; + u32 total_vcxb_length; + u8 pad_0x60[32]; +} __packed __aligned(8); + +/* Verification Certificate Entry (VCE) Header. */ +struct vce_header { + u32 vce_length; + u8 flags; + u8 key_type; + u16 vc_index; + u8 vc_name[VC_NAME_LEN_BYTES]; /* EBCDIC */ + u8 vc_format; + u8 pad_0x49; + u16 key_id_length; + u8 pad_0x4c; + u8 vc_hash_type; + u16 vc_hash_length; + u8 pad_0x50[4]; + u32 vc_length; + u8 pad_0x58[8]; + u16 vc_hash_offset; + u16 vc_offset; + u8 pad_0x64[28]; +} __packed __aligned(4); + +/* Verification Certificate Block (VCB) Header. 
*/ +struct vcb_header { + u32 vcb_input_length; + u8 pad_0x04[4]; + u16 first_vc_index; + u16 last_vc_index; + u32 pad_0x0c; + u32 cs_token; + u8 pad_0x14[12]; + u32 vcb_output_length; + u8 pad_0x24[3]; + u8 version; + u16 stored_vc_count; + u16 remaining_vc_count; + u8 pad_0x2c[20]; +} __packed __aligned(4); + +/* Verification Certificate Block (VCB). */ +struct vcb { + struct vcb_header vcb_hdr; + u8 vcb_buf[]; +} __packed __aligned(4); + +/* Verification Certificate Entry (VCE). */ +struct vce { + struct vce_header vce_hdr; + u8 cert_data_buf[]; +} __packed __aligned(4); + +static void cert_store_key_describe(const struct key *key, struct seq_file *m) +{ + char ascii[VC_NAME_LEN_BYTES + 1]; + + /* + * First 64 bytes of the key description is key name in EBCDIC CP 500. + * Convert it to ASCII for displaying in /proc/keys. + */ + strscpy(ascii, key->description, sizeof(ascii)); + EBCASC_500(ascii, VC_NAME_LEN_BYTES); + seq_puts(m, ascii); + + seq_puts(m, &key->description[VC_NAME_LEN_BYTES]); + if (key_is_positive(key)) + seq_printf(m, ": %u", key->datalen); +} + +/* + * Certificate store key type takes over properties of + * user key but cannot be updated. + */ +static struct key_type key_type_cert_store_key = { + .name = CERT_STORE_KEY_TYPE_NAME, + .preparse = user_preparse, + .free_preparse = user_free_preparse, + .instantiate = generic_key_instantiate, + .revoke = user_revoke, + .destroy = user_destroy, + .describe = cert_store_key_describe, + .read = user_read, +}; + +/* Logging functions. 
*/ +static void pr_dbf_vcb(const struct vcb *b) +{ + pr_dbf_msg("VCB Header:"); + pr_dbf_msg("vcb_input_length: %d", b->vcb_hdr.vcb_input_length); + pr_dbf_msg("first_vc_index: %d", b->vcb_hdr.first_vc_index); + pr_dbf_msg("last_vc_index: %d", b->vcb_hdr.last_vc_index); + pr_dbf_msg("cs_token: %d", b->vcb_hdr.cs_token); + pr_dbf_msg("vcb_output_length: %d", b->vcb_hdr.vcb_output_length); + pr_dbf_msg("version: %d", b->vcb_hdr.version); + pr_dbf_msg("stored_vc_count: %d", b->vcb_hdr.stored_vc_count); + pr_dbf_msg("remaining_vc_count: %d", b->vcb_hdr.remaining_vc_count); +} + +static void pr_dbf_vce(const struct vce *e) +{ + unsigned char vc_name[VC_NAME_LEN_BYTES + 1]; + char log_string[VC_NAME_LEN_BYTES + 40]; + + pr_dbf_msg("VCE Header:"); + pr_dbf_msg("vce_hdr.vce_length: %d", e->vce_hdr.vce_length); + pr_dbf_msg("vce_hdr.flags: %d", e->vce_hdr.flags); + pr_dbf_msg("vce_hdr.key_type: %d", e->vce_hdr.key_type); + pr_dbf_msg("vce_hdr.vc_index: %d", e->vce_hdr.vc_index); + pr_dbf_msg("vce_hdr.vc_format: %d", e->vce_hdr.vc_format); + pr_dbf_msg("vce_hdr.key_id_length: %d", e->vce_hdr.key_id_length); + pr_dbf_msg("vce_hdr.vc_hash_type: %d", e->vce_hdr.vc_hash_type); + pr_dbf_msg("vce_hdr.vc_hash_length: %d", e->vce_hdr.vc_hash_length); + pr_dbf_msg("vce_hdr.vc_hash_offset: %d", e->vce_hdr.vc_hash_offset); + pr_dbf_msg("vce_hdr.vc_length: %d", e->vce_hdr.vc_length); + pr_dbf_msg("vce_hdr.vc_offset: %d", e->vce_hdr.vc_offset); + + /* Certificate name in ASCII. */ + memcpy(vc_name, e->vce_hdr.vc_name, VC_NAME_LEN_BYTES); + EBCASC_500(vc_name, VC_NAME_LEN_BYTES); + vc_name[VC_NAME_LEN_BYTES] = '\0'; + + snprintf(log_string, sizeof(log_string), + "index: %d vce_hdr.vc_name (ASCII): %s", + e->vce_hdr.vc_index, vc_name); + debug_text_event(cert_store_hexdump, 3, log_string); + + /* Certificate data. 
*/ + debug_text_event(cert_store_hexdump, 3, "VCE: Certificate data start"); + debug_event(cert_store_hexdump, 3, (u8 *)e->cert_data_buf, 128); + debug_text_event(cert_store_hexdump, 3, "VCE: Certificate data end"); + debug_event(cert_store_hexdump, 3, + (u8 *)e->cert_data_buf + e->vce_hdr.vce_length - 128, 128); +} + +static void pr_dbf_vcssb(const struct vcssb *s) +{ + debug_text_event(cert_store_hexdump, 3, "DIAG320 Subcode1"); + debug_event(cert_store_hexdump, 3, (u8 *)s, VCSSB_LEN_BYTES); + + pr_dbf_msg("VCSSB:"); + pr_dbf_msg("vcssb_length: %u", s->vcssb_length); + pr_dbf_msg("version: %u", s->version); + pr_dbf_msg("cs_token: %u", s->cs_token); + pr_dbf_msg("total_vc_index_count: %u", s->total_vc_index_count); + pr_dbf_msg("max_vc_index_count: %u", s->max_vc_index_count); + pr_dbf_msg("max_vce_length: %u", s->max_vce_length); + pr_dbf_msg("max_vcxe_length: %u", s->max_vcxe_length); + pr_dbf_msg("max_single_vcb_length: %u", s->max_single_vcb_length); + pr_dbf_msg("total_vcb_length: %u", s->total_vcb_length); + pr_dbf_msg("max_single_vcxb_length: %u", s->max_single_vcxb_length); + pr_dbf_msg("total_vcxb_length: %u", s->total_vcxb_length); +} + +static int __diag320(unsigned long subcode, void *addr) +{ + union register_pair rp = { .even = (unsigned long)addr, }; + + asm volatile( + " diag %[rp],%[subcode],0x320\n" + "0: nopr %%r7\n" + EX_TABLE(0b, 0b) + : [rp] "+d" (rp.pair) + : [subcode] "d" (subcode) + : "cc", "memory"); + + return rp.odd; +} + +static int diag320(unsigned long subcode, void *addr) +{ + diag_stat_inc(DIAG_STAT_X320); + + return __diag320(subcode, addr); +} + +/* + * Calculate SHA256 hash of the VCE certificate and compare it to hash stored in + * VCE. Return -EINVAL if hashes don't match. 
+ */ +static int check_certificate_hash(const struct vce *vce) +{ + u8 hash[SHA256_DIGEST_SIZE]; + u16 vc_hash_length; + u8 *vce_hash; + + vce_hash = (u8 *)vce + vce->vce_hdr.vc_hash_offset; + vc_hash_length = vce->vce_hdr.vc_hash_length; + sha256((u8 *)vce + vce->vce_hdr.vc_offset, vce->vce_hdr.vc_length, hash); + if (memcmp(vce_hash, hash, vc_hash_length) == 0) + return 0; + + pr_dbf_msg("SHA256 hash of received certificate does not match"); + debug_text_event(cert_store_hexdump, 3, "VCE hash:"); + debug_event(cert_store_hexdump, 3, vce_hash, SHA256_DIGEST_SIZE); + debug_text_event(cert_store_hexdump, 3, "Calculated hash:"); + debug_event(cert_store_hexdump, 3, hash, SHA256_DIGEST_SIZE); + + return -EINVAL; +} + +static int check_certificate_valid(const struct vce *vce) +{ + if (!(vce->vce_hdr.flags & VCE_FLAGS_VALID_MASK)) { + pr_dbf_msg("Certificate entry is invalid"); + return -EINVAL; + } + if (vce->vce_hdr.vc_format != 1) { + pr_dbf_msg("Certificate format is not supported"); + return -EINVAL; + } + if (vce->vce_hdr.vc_hash_type != 1) { + pr_dbf_msg("Hash type is not supported"); + return -EINVAL; + } + + return check_certificate_hash(vce); +} + +static struct key *get_user_session_keyring(void) +{ + key_ref_t us_keyring_ref; + + us_keyring_ref = lookup_user_key(KEY_SPEC_USER_SESSION_KEYRING, + KEY_LOOKUP_CREATE, KEY_NEED_LINK); + if (IS_ERR(us_keyring_ref)) { + pr_dbf_msg("Couldn't get user session keyring: %ld", + PTR_ERR(us_keyring_ref)); + return ERR_PTR(-ENOKEY); + } + key_ref_put(us_keyring_ref); + return key_ref_to_ptr(us_keyring_ref); +} + +/* Invalidate all keys from cert_store keyring. 
*/ +static int invalidate_keyring_keys(struct key *keyring) +{ + unsigned long num_keys, key_index; + size_t keyring_payload_len; + key_serial_t *key_array; + struct key *current_key; + int rc; + + keyring_payload_len = key_type_keyring.read(keyring, NULL, 0); + num_keys = keyring_payload_len / sizeof(key_serial_t); + key_array = kcalloc(num_keys, sizeof(key_serial_t), GFP_KERNEL); + if (!key_array) + return -ENOMEM; + + rc = key_type_keyring.read(keyring, (char *)key_array, keyring_payload_len); + if (rc != keyring_payload_len) { + pr_dbf_msg("Couldn't read keyring payload"); + goto out; + } + + for (key_index = 0; key_index < num_keys; key_index++) { + current_key = key_lookup(key_array[key_index]); + pr_dbf_msg("Invalidating key %08x", current_key->serial); + + key_invalidate(current_key); + key_put(current_key); + rc = key_unlink(keyring, current_key); + if (rc) { + pr_dbf_msg("Couldn't unlink key %08x: %d", current_key->serial, rc); + break; + } + } +out: + kfree(key_array); + return rc; +} + +static struct key *find_cs_keyring(void) +{ + key_ref_t cs_keyring_ref; + struct key *cs_keyring; + + cs_keyring_ref = keyring_search(make_key_ref(get_user_session_keyring(), true), + &key_type_keyring, CERT_STORE_KEYRING_NAME, + false); + if (!IS_ERR(cs_keyring_ref)) { + cs_keyring = key_ref_to_ptr(cs_keyring_ref); + key_ref_put(cs_keyring_ref); + goto found; + } + /* Search default locations: thread, process, session keyrings */ + cs_keyring = request_key(&key_type_keyring, CERT_STORE_KEYRING_NAME, NULL); + if (IS_ERR(cs_keyring)) + return NULL; + key_put(cs_keyring); +found: + return cs_keyring; +} + +static void cleanup_cs_keys(void) +{ + struct key *cs_keyring; + + cs_keyring = find_cs_keyring(); + if (!cs_keyring) + return; + + pr_dbf_msg("Found cert_store keyring. Purging..."); + /* + * Remove cert_store_key_type in case invalidation + * of old cert_store keys failed (= severe error). 
+ */ + if (invalidate_keyring_keys(cs_keyring)) + unregister_key_type(&key_type_cert_store_key); + + keyring_clear(cs_keyring); + key_invalidate(cs_keyring); + key_put(cs_keyring); + key_unlink(get_user_session_keyring(), cs_keyring); +} + +static struct key *create_cs_keyring(void) +{ + static struct key *cs_keyring; + + /* Cleanup previous cs_keyring and all associated keys if any. */ + cleanup_cs_keys(); + cs_keyring = keyring_alloc(CERT_STORE_KEYRING_NAME, GLOBAL_ROOT_UID, + GLOBAL_ROOT_GID, current_cred(), + (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW | KEY_USR_READ, + KEY_ALLOC_NOT_IN_QUOTA | KEY_ALLOC_SET_KEEP, + NULL, get_user_session_keyring()); + if (IS_ERR(cs_keyring)) { + pr_dbf_msg("Can't allocate cert_store keyring"); + return NULL; + } + + pr_dbf_msg("Successfully allocated cert_store keyring: %08x", cs_keyring->serial); + + /* + * In case a previous clean-up ran into an + * error and unregistered key type. + */ + register_key_type(&key_type_cert_store_key); + + return cs_keyring; +} + +/* + * Allocate memory and create key description in format + * [key name in EBCDIC]:[VCE index]:[CS token]. + * Return a pointer to key description or NULL if memory + * allocation failed. Memory should be freed by caller. + */ +static char *get_key_description(struct vcssb *vcssb, const struct vce *vce) +{ + size_t len, name_len; + u32 cs_token; + char *desc; + + cs_token = vcssb->cs_token; + /* Description string contains "%64s:%04u:%08u\0". */ + name_len = sizeof(vce->vce_hdr.vc_name); + len = name_len + 1 + 4 + 1 + 8 + 1; + desc = kmalloc(len, GFP_KERNEL); + if (!desc) + return NULL; + + memcpy(desc, vce->vce_hdr.vc_name, name_len); + sprintf(desc + name_len, ":%04u:%08u", vce->vce_hdr.vc_index, cs_token); + + return desc; +} + +/* + * Create a key of type "cert_store_key" using the data from VCE for key + * payload and key description. Link the key to "cert_store" keyring. 
+ */ +static int create_key_from_vce(struct vcssb *vcssb, struct vce *vce, + struct key *keyring) +{ + key_ref_t newkey; + char *desc; + int rc; + + desc = get_key_description(vcssb, vce); + if (!desc) + return -ENOMEM; + + newkey = key_create_or_update( + make_key_ref(keyring, true), CERT_STORE_KEY_TYPE_NAME, + desc, (u8 *)vce + vce->vce_hdr.vc_offset, + vce->vce_hdr.vc_length, + (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW | KEY_USR_READ, + KEY_ALLOC_NOT_IN_QUOTA); + + rc = PTR_ERR_OR_ZERO(newkey); + if (rc) { + pr_dbf_msg("Couldn't create a key from Certificate Entry (%d)", rc); + rc = -ENOKEY; + goto out; + } + + key_ref_put(newkey); +out: + kfree(desc); + return rc; +} + +/* Get Verification Certificate Storage Size block with DIAG320 subcode 1. */ +static int get_vcssb(struct vcssb *vcssb) +{ + int diag320_rc; + + memset(vcssb, 0, sizeof(*vcssb)); + vcssb->vcssb_length = VCSSB_LEN_BYTES; + diag320_rc = diag320(DIAG320_STORAGE, vcssb); + pr_dbf_vcssb(vcssb); + + if (diag320_rc != DIAG320_RC_OK) { + pr_dbf_msg("Diag 320 Subcode 1 returned bad RC: %04x", diag320_rc); + return -EIO; + } + if (vcssb->vcssb_length == VCSSB_LEN_NO_CERTS) { + pr_dbf_msg("No certificates available for current configuration"); + return -ENOKEY; + } + + return 0; +} + +static u32 get_4k_mult_vcb_size(struct vcssb *vcssb) +{ + return round_up(vcssb->max_single_vcb_length, PAGE_SIZE); +} + +/* Fill input fields of single-entry VCB that will be read by LPAR. */ +static void fill_vcb_input(struct vcssb *vcssb, struct vcb *vcb, u16 index) +{ + memset(vcb, 0, sizeof(*vcb)); + vcb->vcb_hdr.vcb_input_length = get_4k_mult_vcb_size(vcssb); + vcb->vcb_hdr.cs_token = vcssb->cs_token; + + /* Request single entry. 
*/ + vcb->vcb_hdr.first_vc_index = index; + vcb->vcb_hdr.last_vc_index = index; +} + +static void extract_vce_from_sevcb(struct vcb *vcb, struct vce *vce) +{ + struct vce *extracted_vce; + + extracted_vce = (struct vce *)vcb->vcb_buf; + memcpy(vce, vcb->vcb_buf, extracted_vce->vce_hdr.vce_length); + pr_dbf_vce(vce); +} + +static int get_sevcb(struct vcssb *vcssb, u16 index, struct vcb *vcb) +{ + int rc, diag320_rc; + + fill_vcb_input(vcssb, vcb, index); + + diag320_rc = diag320(DIAG320_CERT_BLOCK, vcb); + pr_dbf_msg("Diag 320 Subcode2 RC %2x", diag320_rc); + pr_dbf_vcb(vcb); + + switch (diag320_rc) { + case DIAG320_RC_OK: + rc = 0; + if (vcb->vcb_hdr.vcb_output_length == VCB_LEN_NO_CERTS) { + pr_dbf_msg("No certificate entry for index %u", index); + rc = -ENOKEY; + } else if (vcb->vcb_hdr.remaining_vc_count != 0) { + /* Retry on insufficient space. */ + pr_dbf_msg("Couldn't get all requested certificates"); + rc = -EAGAIN; + } + break; + case DIAG320_RC_CS_NOMATCH: + pr_dbf_msg("Certificate Store token mismatch"); + rc = -EAGAIN; + break; + default: + pr_dbf_msg("Diag 320 Subcode2 returned bad rc (0x%4x)", diag320_rc); + rc = -EINVAL; + break; + } + + return rc; +} + +/* + * Allocate memory for single-entry VCB, get VCB via DIAG320 subcode 2 call, + * extract VCE and create a key from its' certificate. 
+ */ +static int create_key_from_sevcb(struct vcssb *vcssb, u16 index, + struct key *keyring) +{ + struct vcb *vcb; + struct vce *vce; + int rc; + + rc = -ENOMEM; + vcb = vmalloc(get_4k_mult_vcb_size(vcssb)); + vce = vmalloc(vcssb->max_single_vcb_length - sizeof(vcb->vcb_hdr)); + if (!vcb || !vce) + goto out; + + rc = get_sevcb(vcssb, index, vcb); + if (rc) + goto out; + + extract_vce_from_sevcb(vcb, vce); + rc = check_certificate_valid(vce); + if (rc) + goto out; + + rc = create_key_from_vce(vcssb, vce, keyring); + if (rc) + goto out; + + pr_dbf_msg("Successfully created key from Certificate Entry %d", index); +out: + vfree(vce); + vfree(vcb); + return rc; +} + +/* + * Request a single-entry VCB for each VCE available for the partition. + * Create a key from it and link it to cert_store keyring. If no keys + * could be created (i.e. VCEs were invalid) return -ENOKEY. + */ +static int add_certificates_to_keyring(struct vcssb *vcssb, struct key *keyring) +{ + int rc, index, count, added; + + count = 0; + added = 0; + /* Certificate Store entries indices start with 1 and have no gaps. */ + for (index = 1; index < vcssb->total_vc_index_count + 1; index++) { + pr_dbf_msg("Creating key from VCE %u", index); + rc = create_key_from_sevcb(vcssb, index, keyring); + count++; + + if (rc == -EAGAIN) + return rc; + + if (rc) + pr_dbf_msg("Creating key from VCE %u failed (%d)", index, rc); + else + added++; + } + + if (added == 0) { + pr_dbf_msg("Processed %d entries. No keys created", count); + return -ENOKEY; + } + + pr_info("Added %d of %d keys to cert_store keyring", added, count); + + /* + * Do not allow to link more keys to certificate store keyring after all + * the VCEs were processed. + */ + rc = keyring_restrict(make_key_ref(keyring, true), NULL, NULL); + if (rc) + pr_dbf_msg("Failed to set restriction to cert_store keyring (%d)", rc); + + return 0; +} + +/* + * Check which DIAG320 subcodes are installed. + * Return -ENOENT if subcodes 1 or 2 are not available. 
+ */ +static int query_diag320_subcodes(void) +{ + unsigned long ism[ISM_LEN_DWORDS]; + int rc; + + rc = diag320(0, ism); + if (rc != DIAG320_RC_OK) { + pr_dbf_msg("DIAG320 subcode query returned %04x", rc); + return -ENOENT; + } + + debug_text_event(cert_store_hexdump, 3, "DIAG320 Subcode 0"); + debug_event(cert_store_hexdump, 3, ism, sizeof(ism)); + + if (!test_bit_inv(1, ism) || !test_bit_inv(2, ism)) { + pr_dbf_msg("Not all required DIAG320 subcodes are installed"); + return -ENOENT; + } + + return 0; +} + +/* + * Check if Certificate Store is supported by the firmware and DIAG320 subcodes + * 1 and 2 are installed. Create cert_store keyring and link all certificates + * available for the current partition to it as "cert_store_key" type + * keys. On refresh or error invalidate cert_store keyring and destroy + * all keys of "cert_store_key" type. + */ +static int fill_cs_keyring(void) +{ + struct key *cs_keyring; + struct vcssb *vcssb; + int rc; + + rc = -ENOMEM; + vcssb = kmalloc(VCSSB_LEN_BYTES, GFP_KERNEL); + if (!vcssb) + goto cleanup_keys; + + rc = -ENOENT; + if (!sclp.has_diag320) { + pr_dbf_msg("Certificate Store is not supported"); + goto cleanup_keys; + } + + rc = query_diag320_subcodes(); + if (rc) + goto cleanup_keys; + + rc = get_vcssb(vcssb); + if (rc) + goto cleanup_keys; + + cs_keyring = create_cs_keyring(); + if (!cs_keyring) + goto cleanup_keys; + + rc = add_certificates_to_keyring(vcssb, cs_keyring); + if (rc) + goto cleanup_cs_keyring; + + goto out; + +cleanup_cs_keyring: + key_put(cs_keyring); +cleanup_keys: + cleanup_cs_keys(); +out: + kfree(vcssb); + return rc; +} + +static DEFINE_MUTEX(cs_refresh_lock); +static int cs_status_val = -1; + +static ssize_t cs_status_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + if (cs_status_val == -1) + return sysfs_emit(buf, "uninitialized\n"); + else if (cs_status_val == 0) + return sysfs_emit(buf, "ok\n"); + + return sysfs_emit(buf, "failed (%d)\n", cs_status_val); +} + +static 
struct kobj_attribute cs_status_attr = __ATTR_RO(cs_status); + +static ssize_t refresh_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int rc, retries; + + pr_dbf_msg("Refresh certificate store information requested"); + rc = mutex_lock_interruptible(&cs_refresh_lock); + if (rc) + return rc; + + for (retries = 0; retries < DIAG_MAX_RETRIES; retries++) { + /* Request certificates from certificate store. */ + rc = fill_cs_keyring(); + if (rc) + pr_dbf_msg("Failed to refresh certificate store information (%d)", rc); + if (rc != -EAGAIN) + break; + } + cs_status_val = rc; + mutex_unlock(&cs_refresh_lock); + + return rc ?: count; +} + +static struct kobj_attribute refresh_attr = __ATTR_WO(refresh); + +static const struct attribute *cert_store_attrs[] __initconst = { + &cs_status_attr.attr, + &refresh_attr.attr, + NULL, +}; + +static struct kobject *cert_store_kobj; + +static int __init cert_store_init(void) +{ + int rc = -ENOMEM; + + cert_store_dbf = debug_register("cert_store_msg", 10, 1, 64); + if (!cert_store_dbf) + goto cleanup_dbf; + + cert_store_hexdump = debug_register("cert_store_hexdump", 3, 1, 128); + if (!cert_store_hexdump) + goto cleanup_dbf; + + debug_register_view(cert_store_hexdump, &debug_hex_ascii_view); + debug_register_view(cert_store_dbf, &debug_sprintf_view); + + /* Create directory /sys/firmware/cert_store. 
*/ + cert_store_kobj = kobject_create_and_add("cert_store", firmware_kobj); + if (!cert_store_kobj) + goto cleanup_dbf; + + rc = sysfs_create_files(cert_store_kobj, cert_store_attrs); + if (rc) + goto cleanup_kobj; + + register_key_type(&key_type_cert_store_key); + + return rc; + +cleanup_kobj: + kobject_put(cert_store_kobj); +cleanup_dbf: + debug_unregister(cert_store_dbf); + debug_unregister(cert_store_hexdump); + + return rc; +} +device_initcall(cert_store_init); diff --git a/arch/s390/kernel/diag.c b/arch/s390/kernel/diag.c index 82079f2d8583..cca56b649ca3 100644 --- a/arch/s390/kernel/diag.c +++ b/arch/s390/kernel/diag.c @@ -50,6 +50,7 @@ static const struct diag_desc diag_map[NR_DIAG_STAT] = { [DIAG_STAT_X304] = { .code = 0x304, .name = "Partition-Resource Service" }, [DIAG_STAT_X308] = { .code = 0x308, .name = "List-Directed IPL" }, [DIAG_STAT_X318] = { .code = 0x318, .name = "CP Name and Version Codes" }, + [DIAG_STAT_X320] = { .code = 0x320, .name = "Certificate Store" }, [DIAG_STAT_X500] = { .code = 0x500, .name = "Virtio Service" }, }; diff --git a/drivers/s390/char/sclp_early.c b/drivers/s390/char/sclp_early.c index f480d6c7fd39..fdc8668f3fba 100644 --- a/drivers/s390/char/sclp_early.c +++ b/drivers/s390/char/sclp_early.c @@ -55,6 +55,7 @@ static void __init sclp_early_facilities_detect(void) S390_lowcore.machine_flags |= MACHINE_FLAG_TLB_GUEST; if (sccb->cpuoff > 134) { sclp.has_diag318 = !!(sccb->byte_134 & 0x80); + sclp.has_diag320 = !!(sccb->byte_134 & 0x04); sclp.has_iplcc = !!(sccb->byte_134 & 0x02); } if (sccb->cpuoff > 137) { -- cgit v1.2.3 From c83cd4fe31d52bca0587370d9e98f00072aefa27 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Sun, 2 Jul 2023 21:20:09 +0200 Subject: s390/diag: handle diag 204 subcode 4 address correctly Diagnose 204 subcode 4 requires a real (physical) address, but a virtual address is passed to the inline assembly. Convert the address to a physical address for only this specific case. 
Acked-by: Alexander Gordeev Reviewed-by: Christian Borntraeger Signed-off-by: Heiko Carstens --- arch/s390/include/asm/diag.h | 2 ++ arch/s390/kernel/diag.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/arch/s390/include/asm/diag.h b/arch/s390/include/asm/diag.h index fb5a886ff47f..bed804137537 100644 --- a/arch/s390/include/asm/diag.h +++ b/arch/s390/include/asm/diag.h @@ -109,6 +109,8 @@ enum diag204_sc { DIAG204_SUBC_STIB7 = 7 }; +#define DIAG204_SUBCODE_MASK 0xffff + /* The two available diag 204 data formats */ enum diag204_format { DIAG204_INFO_SIMPLE = 0, diff --git a/arch/s390/kernel/diag.c b/arch/s390/kernel/diag.c index cca56b649ca3..f3a0f39cbd6c 100644 --- a/arch/s390/kernel/diag.c +++ b/arch/s390/kernel/diag.c @@ -171,6 +171,8 @@ static inline int __diag204(unsigned long *subcode, unsigned long size, void *ad int diag204(unsigned long subcode, unsigned long size, void *addr) { diag_stat_inc(DIAG_STAT_X204); + if ((subcode & DIAG204_SUBCODE_MASK) == DIAG204_SUBC_STIB4) + addr = (void *)__pa(addr); size = __diag204(&subcode, size, addr); if (subcode) return -1; -- cgit v1.2.3 From 86e74965bbdf534b9c7f1f678b963492de41276e Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 3 Jul 2023 14:34:25 +0200 Subject: s390/sthyi: enforce 4k alignment of vmalloc'ed area vmalloc() does not guarantee any alignment, unless it is explicitly requested with e.g. __vmalloc_node(). Using diag204() with subcode 7 requires a 4k aligned virtual buffer. Therefore switch to __vmalloc_node(). Note: with the current vmalloc() implementation callers would still get a 4k aligned area, even though this is quite non-obvious looking at the code. So changing this in sthyi doesn't fix a real bug. It is just to make sure the code will not suffer from some obscure options, like it happened in the past with kmalloc() where debug options changed the assumed alignment of allocated memory areas. 
Acked-by: Alexander Gordeev Signed-off-by: Heiko Carstens --- arch/s390/kernel/sthyi.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/s390/kernel/sthyi.c b/arch/s390/kernel/sthyi.c index 4d141e2c132e..98ebedbb5761 100644 --- a/arch/s390/kernel/sthyi.c +++ b/arch/s390/kernel/sthyi.c @@ -317,7 +317,9 @@ static void fill_diag(struct sthyi_sctns *sctns) if (pages <= 0) return; - diag204_buf = vmalloc(array_size(pages, PAGE_SIZE)); + diag204_buf = __vmalloc_node(array_size(pages, PAGE_SIZE), + PAGE_SIZE, GFP_KERNEL, NUMA_NO_NODE, + __builtin_return_address(0)); if (!diag204_buf) return; -- cgit v1.2.3 From 5ac8c72462cdad56e37981eb2172c5baa1ea40d6 Mon Sep 17 00:00:00 2001 From: Harald Freudenberger Date: Wed, 28 Jun 2023 12:36:08 +0200 Subject: s390/zcrypt: remove CEX2 and CEX3 device drivers Remove the legacy device driver code for CEX2 and CEX3 cards. The last machines which are able to handle CEX2 crypto cards are z10 EC first available 2008 and z10 BC first available 2009. The last machines able to handle a CEX3 crypto card are z196 first available 2010 and z114 first available 2011. Please note that this does not imply to drop CEX2 and CEX3 support in general. With older kernels on hardware up to the aforementioned machine models these crypto cards will get support by IBM. The removal of the CEX2 and CEX3 device drivers code opens up some simplifications, for example support for crypto cards without rng support can be removed also. 
Signed-off-by: Harald Freudenberger Acked-by: Heiko Carstens Signed-off-by: Heiko Carstens --- drivers/crypto/Kconfig | 7 +- drivers/s390/crypto/Makefile | 2 +- drivers/s390/crypto/ap_bus.c | 25 +- drivers/s390/crypto/ap_bus.h | 19 +- drivers/s390/crypto/ap_queue.c | 47 +--- drivers/s390/crypto/zcrypt_cex2a.c | 227 ------------------ drivers/s390/crypto/zcrypt_cex2a.h | 134 ----------- drivers/s390/crypto/zcrypt_cex2c.c | 421 --------------------------------- drivers/s390/crypto/zcrypt_cex2c.h | 18 -- drivers/s390/crypto/zcrypt_msgtype50.c | 64 +++-- drivers/s390/crypto/zcrypt_msgtype50.h | 3 +- drivers/s390/crypto/zcrypt_msgtype6.c | 14 +- 12 files changed, 42 insertions(+), 939 deletions(-) diff --git a/drivers/crypto/Kconfig b/drivers/crypto/Kconfig index 44e44b8d9ce6..c761952f0dc6 100644 --- a/drivers/crypto/Kconfig +++ b/drivers/crypto/Kconfig @@ -70,10 +70,9 @@ config ZCRYPT select HW_RANDOM help Select this option if you want to enable support for - s390 cryptographic adapters like: - + Crypto Express 2 up to 7 Coprocessor (CEXxC) - + Crypto Express 2 up to 7 Accelerator (CEXxA) - + Crypto Express 4 up to 7 EP11 Coprocessor (CEXxP) + s390 cryptographic adapters like Crypto Express 4 up + to 8 in Coprocessor (CEXxC), EP11 Coprocessor (CEXxP) + or Accelerator (CEXxA) mode. 
config ZCRYPT_DEBUG bool "Enable debug features for s390 cryptographic adapters" diff --git a/drivers/s390/crypto/Makefile b/drivers/s390/crypto/Makefile index 22d2db690cd3..0edacd101c12 100644 --- a/drivers/s390/crypto/Makefile +++ b/drivers/s390/crypto/Makefile @@ -11,7 +11,7 @@ zcrypt-objs += zcrypt_msgtype6.o zcrypt_msgtype50.o zcrypt-objs += zcrypt_ccamisc.o zcrypt_ep11misc.o obj-$(CONFIG_ZCRYPT) += zcrypt.o # adapter drivers depend on ap.o and zcrypt.o -obj-$(CONFIG_ZCRYPT) += zcrypt_cex2c.o zcrypt_cex2a.o zcrypt_cex4.o +obj-$(CONFIG_ZCRYPT) += zcrypt_cex4.o # pkey kernel module pkey-objs := pkey_api.o diff --git a/drivers/s390/crypto/ap_bus.c b/drivers/s390/crypto/ap_bus.c index 420120be300f..b1d2fedea086 100644 --- a/drivers/s390/crypto/ap_bus.c +++ b/drivers/s390/crypto/ap_bus.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0+ /* - * Copyright IBM Corp. 2006, 2021 + * Copyright IBM Corp. 2006, 2023 * Author(s): Cornelia Huck * Martin Schwidefsky * Ralph Wuerthner @@ -387,23 +387,6 @@ static int ap_queue_info(ap_qid_t qid, int *q_type, unsigned int *q_fac, *q_ml = tapq_info.ml; *q_decfg = status.response_code == AP_RESPONSE_DECONFIGURED; *q_cstop = status.response_code == AP_RESPONSE_CHECKSTOPPED; - switch (*q_type) { - /* For CEX2 and CEX3 the available functions - * are not reflected by the facilities bits. - * Instead it is coded into the type. So here - * modify the function bits based on the type. 
- */ - case AP_DEVICE_TYPE_CEX2A: - case AP_DEVICE_TYPE_CEX3A: - *q_fac |= 0x08000000; - break; - case AP_DEVICE_TYPE_CEX2C: - case AP_DEVICE_TYPE_CEX3C: - *q_fac |= 0x10000000; - break; - default: - break; - } return 1; default: /* @@ -1678,8 +1661,8 @@ static int ap_get_compatible_type(ap_qid_t qid, int rawtype, unsigned int func) { int comp_type = 0; - /* < CEX2A is not supported */ - if (rawtype < AP_DEVICE_TYPE_CEX2A) { + /* < CEX4 is not supported */ + if (rawtype < AP_DEVICE_TYPE_CEX4) { AP_DBF_WARN("%s queue=%02x.%04x unsupported type %d\n", __func__, AP_QID_CARD(qid), AP_QID_QUEUE(qid), rawtype); @@ -1701,7 +1684,7 @@ static int ap_get_compatible_type(ap_qid_t qid, int rawtype, unsigned int func) apinfo.cat = AP_DEVICE_TYPE_CEX8; status = ap_qact(qid, 0, &apinfo); if (status.response_code == AP_RESPONSE_NORMAL && - apinfo.cat >= AP_DEVICE_TYPE_CEX2A && + apinfo.cat >= AP_DEVICE_TYPE_CEX4 && apinfo.cat <= AP_DEVICE_TYPE_CEX8) comp_type = apinfo.cat; } diff --git a/drivers/s390/crypto/ap_bus.h b/drivers/s390/crypto/ap_bus.h index 0d7b7eb374ad..47bbe9babc59 100644 --- a/drivers/s390/crypto/ap_bus.h +++ b/drivers/s390/crypto/ap_bus.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0+ */ /* - * Copyright IBM Corp. 2006, 2019 + * Copyright IBM Corp. 
2006, 2023 * Author(s): Cornelia Huck * Martin Schwidefsky * Ralph Wuerthner @@ -67,15 +67,8 @@ static inline int ap_test_bit(unsigned int *ptr, unsigned int nr) #define AP_RESPONSE_INVALID_DOMAIN 0x42 /* - * Known device types + * Supported AP device types */ -#define AP_DEVICE_TYPE_PCICC 3 -#define AP_DEVICE_TYPE_PCICA 4 -#define AP_DEVICE_TYPE_PCIXCC 5 -#define AP_DEVICE_TYPE_CEX2A 6 -#define AP_DEVICE_TYPE_CEX2C 7 -#define AP_DEVICE_TYPE_CEX3A 8 -#define AP_DEVICE_TYPE_CEX3C 9 #define AP_DEVICE_TYPE_CEX4 10 #define AP_DEVICE_TYPE_CEX5 11 #define AP_DEVICE_TYPE_CEX6 12 @@ -272,14 +265,6 @@ static inline void ap_release_message(struct ap_message *ap_msg) kfree_sensitive(ap_msg->private); } -/* - * Note: don't use ap_send/ap_recv after using ap_queue_message - * for the first time. Otherwise the ap message queue will get - * confused. - */ -int ap_send(ap_qid_t qid, unsigned long psmid, void *msg, size_t msglen); -int ap_recv(ap_qid_t qid, unsigned long *psmid, void *msg, size_t msglen); - enum ap_sm_wait ap_sm_event(struct ap_queue *aq, enum ap_sm_event event); enum ap_sm_wait ap_sm_event_loop(struct ap_queue *aq, enum ap_sm_event event); diff --git a/drivers/s390/crypto/ap_queue.c b/drivers/s390/crypto/ap_queue.c index 30df83735adf..1336e632adc4 100644 --- a/drivers/s390/crypto/ap_queue.c +++ b/drivers/s390/crypto/ap_queue.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Copyright IBM Corp. 2016 + * Copyright IBM Corp. 2016, 2023 * Author(s): Martin Schwidefsky * * Adjunct processor bus, queue related code. 
@@ -93,51 +93,6 @@ __ap_send(ap_qid_t qid, unsigned long psmid, void *msg, size_t msglen, return ap_nqap(qid, psmid, msg, msglen); } -int ap_send(ap_qid_t qid, unsigned long psmid, void *msg, size_t msglen) -{ - struct ap_queue_status status; - - status = __ap_send(qid, psmid, msg, msglen, 0); - if (status.async) - return -EPERM; - switch (status.response_code) { - case AP_RESPONSE_NORMAL: - return 0; - case AP_RESPONSE_Q_FULL: - case AP_RESPONSE_RESET_IN_PROGRESS: - return -EBUSY; - case AP_RESPONSE_REQ_FAC_NOT_INST: - return -EINVAL; - default: /* Device is gone. */ - return -ENODEV; - } -} -EXPORT_SYMBOL(ap_send); - -int ap_recv(ap_qid_t qid, unsigned long *psmid, void *msg, size_t msglen) -{ - struct ap_queue_status status; - - if (!msg) - return -EINVAL; - status = ap_dqap(qid, psmid, msg, msglen, NULL, NULL, NULL); - if (status.async) - return -EPERM; - switch (status.response_code) { - case AP_RESPONSE_NORMAL: - return 0; - case AP_RESPONSE_NO_PENDING_REPLY: - if (status.queue_empty) - return -ENOENT; - return -EBUSY; - case AP_RESPONSE_RESET_IN_PROGRESS: - return -EBUSY; - default: - return -ENODEV; - } -} -EXPORT_SYMBOL(ap_recv); - /* State machine definitions and helpers */ static enum ap_sm_wait ap_sm_nop(struct ap_queue *aq) diff --git a/drivers/s390/crypto/zcrypt_cex2a.c b/drivers/s390/crypto/zcrypt_cex2a.c index 83f692c9c197..e69de29bb2d1 100644 --- a/drivers/s390/crypto/zcrypt_cex2a.c +++ b/drivers/s390/crypto/zcrypt_cex2a.c @@ -1,227 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0+ -/* - * Copyright IBM Corp. 
2001, 2012 - * Author(s): Robert Burroughs - * Eric Rossman (edrossma@us.ibm.com) - * - * Hotplug & misc device support: Jochen Roehrig (roehrig@de.ibm.com) - * Major cleanup & driver split: Martin Schwidefsky - * Ralph Wuerthner - * MSGTYPE restruct: Holger Dengler - */ - -#include -#include -#include -#include -#include -#include -#include - -#include "ap_bus.h" -#include "zcrypt_api.h" -#include "zcrypt_error.h" -#include "zcrypt_cex2a.h" -#include "zcrypt_msgtype50.h" - -#define CEX2A_MIN_MOD_SIZE 1 /* 8 bits */ -#define CEX2A_MAX_MOD_SIZE 256 /* 2048 bits */ -#define CEX3A_MIN_MOD_SIZE CEX2A_MIN_MOD_SIZE -#define CEX3A_MAX_MOD_SIZE 512 /* 4096 bits */ - -#define CEX2A_MAX_MESSAGE_SIZE 0x390 /* sizeof(struct type50_crb2_msg) */ -#define CEX2A_MAX_RESPONSE_SIZE 0x110 /* max outputdatalength + type80_hdr */ - -#define CEX3A_MAX_RESPONSE_SIZE 0x210 /* 512 bit modulus - * (max outputdatalength) + - * type80_hdr - */ -#define CEX3A_MAX_MESSAGE_SIZE sizeof(struct type50_crb3_msg) - -#define CEX2A_CLEANUP_TIME (15 * HZ) -#define CEX3A_CLEANUP_TIME CEX2A_CLEANUP_TIME - -MODULE_AUTHOR("IBM Corporation"); -MODULE_DESCRIPTION("CEX2A/CEX3A Cryptographic Coprocessor device driver, " \ - "Copyright IBM Corp. 2001, 2018"); -MODULE_LICENSE("GPL"); - -static struct ap_device_id zcrypt_cex2a_card_ids[] = { - { .dev_type = AP_DEVICE_TYPE_CEX2A, - .match_flags = AP_DEVICE_ID_MATCH_CARD_TYPE }, - { .dev_type = AP_DEVICE_TYPE_CEX3A, - .match_flags = AP_DEVICE_ID_MATCH_CARD_TYPE }, - { /* end of list */ }, -}; - -MODULE_DEVICE_TABLE(ap, zcrypt_cex2a_card_ids); - -static struct ap_device_id zcrypt_cex2a_queue_ids[] = { - { .dev_type = AP_DEVICE_TYPE_CEX2A, - .match_flags = AP_DEVICE_ID_MATCH_QUEUE_TYPE }, - { .dev_type = AP_DEVICE_TYPE_CEX3A, - .match_flags = AP_DEVICE_ID_MATCH_QUEUE_TYPE }, - { /* end of list */ }, -}; - -MODULE_DEVICE_TABLE(ap, zcrypt_cex2a_queue_ids); - -/* - * Probe function for CEX2A card devices. 
It always accepts the AP device - * since the bus_match already checked the card type. - * @ap_dev: pointer to the AP device. - */ -static int zcrypt_cex2a_card_probe(struct ap_device *ap_dev) -{ - /* - * Normalized speed ratings per crypto adapter - * MEX_1k, MEX_2k, MEX_4k, CRT_1k, CRT_2k, CRT_4k, RNG, SECKEY - */ - static const int CEX2A_SPEED_IDX[] = { - 800, 1000, 2000, 900, 1200, 2400, 0, 0}; - static const int CEX3A_SPEED_IDX[] = { - 400, 500, 1000, 450, 550, 1200, 0, 0}; - - struct ap_card *ac = to_ap_card(&ap_dev->device); - struct zcrypt_card *zc; - int rc = 0; - - zc = zcrypt_card_alloc(); - if (!zc) - return -ENOMEM; - zc->card = ac; - dev_set_drvdata(&ap_dev->device, zc); - - if (ac->ap_dev.device_type == AP_DEVICE_TYPE_CEX2A) { - zc->min_mod_size = CEX2A_MIN_MOD_SIZE; - zc->max_mod_size = CEX2A_MAX_MOD_SIZE; - zc->speed_rating = CEX2A_SPEED_IDX; - zc->max_exp_bit_length = CEX2A_MAX_MOD_SIZE; - zc->type_string = "CEX2A"; - zc->user_space_type = ZCRYPT_CEX2A; - } else if (ac->ap_dev.device_type == AP_DEVICE_TYPE_CEX3A) { - zc->min_mod_size = CEX2A_MIN_MOD_SIZE; - zc->max_mod_size = CEX2A_MAX_MOD_SIZE; - zc->max_exp_bit_length = CEX2A_MAX_MOD_SIZE; - if (ap_test_bit(&ac->functions, AP_FUNC_MEX4K) && - ap_test_bit(&ac->functions, AP_FUNC_CRT4K)) { - zc->max_mod_size = CEX3A_MAX_MOD_SIZE; - zc->max_exp_bit_length = CEX3A_MAX_MOD_SIZE; - } - zc->speed_rating = CEX3A_SPEED_IDX; - zc->type_string = "CEX3A"; - zc->user_space_type = ZCRYPT_CEX3A; - } else { - zcrypt_card_free(zc); - return -ENODEV; - } - zc->online = 1; - - rc = zcrypt_card_register(zc); - if (rc) - zcrypt_card_free(zc); - - return rc; -} - -/* - * This is called to remove the CEX2A card driver information - * if an AP card device is removed. 
- */ -static void zcrypt_cex2a_card_remove(struct ap_device *ap_dev) -{ - struct zcrypt_card *zc = dev_get_drvdata(&ap_dev->device); - - zcrypt_card_unregister(zc); -} - -static struct ap_driver zcrypt_cex2a_card_driver = { - .probe = zcrypt_cex2a_card_probe, - .remove = zcrypt_cex2a_card_remove, - .ids = zcrypt_cex2a_card_ids, - .flags = AP_DRIVER_FLAG_DEFAULT, -}; - -/* - * Probe function for CEX2A queue devices. It always accepts the AP device - * since the bus_match already checked the queue type. - * @ap_dev: pointer to the AP device. - */ -static int zcrypt_cex2a_queue_probe(struct ap_device *ap_dev) -{ - struct ap_queue *aq = to_ap_queue(&ap_dev->device); - struct zcrypt_queue *zq = NULL; - int rc; - - switch (ap_dev->device_type) { - case AP_DEVICE_TYPE_CEX2A: - zq = zcrypt_queue_alloc(CEX2A_MAX_RESPONSE_SIZE); - if (!zq) - return -ENOMEM; - break; - case AP_DEVICE_TYPE_CEX3A: - zq = zcrypt_queue_alloc(CEX3A_MAX_RESPONSE_SIZE); - if (!zq) - return -ENOMEM; - break; - } - if (!zq) - return -ENODEV; - zq->ops = zcrypt_msgtype(MSGTYPE50_NAME, MSGTYPE50_VARIANT_DEFAULT); - zq->queue = aq; - zq->online = 1; - atomic_set(&zq->load, 0); - ap_queue_init_state(aq); - ap_queue_init_reply(aq, &zq->reply); - aq->request_timeout = CEX2A_CLEANUP_TIME; - dev_set_drvdata(&ap_dev->device, zq); - rc = zcrypt_queue_register(zq); - if (rc) - zcrypt_queue_free(zq); - - return rc; -} - -/* - * This is called to remove the CEX2A queue driver information - * if an AP queue device is removed. 
- */ -static void zcrypt_cex2a_queue_remove(struct ap_device *ap_dev) -{ - struct zcrypt_queue *zq = dev_get_drvdata(&ap_dev->device); - - zcrypt_queue_unregister(zq); -} - -static struct ap_driver zcrypt_cex2a_queue_driver = { - .probe = zcrypt_cex2a_queue_probe, - .remove = zcrypt_cex2a_queue_remove, - .ids = zcrypt_cex2a_queue_ids, - .flags = AP_DRIVER_FLAG_DEFAULT, -}; - -int __init zcrypt_cex2a_init(void) -{ - int rc; - - rc = ap_driver_register(&zcrypt_cex2a_card_driver, - THIS_MODULE, "cex2acard"); - if (rc) - return rc; - - rc = ap_driver_register(&zcrypt_cex2a_queue_driver, - THIS_MODULE, "cex2aqueue"); - if (rc) - ap_driver_unregister(&zcrypt_cex2a_card_driver); - - return rc; -} - -void __exit zcrypt_cex2a_exit(void) -{ - ap_driver_unregister(&zcrypt_cex2a_queue_driver); - ap_driver_unregister(&zcrypt_cex2a_card_driver); -} - -module_init(zcrypt_cex2a_init); -module_exit(zcrypt_cex2a_exit); diff --git a/drivers/s390/crypto/zcrypt_cex2a.h b/drivers/s390/crypto/zcrypt_cex2a.h index 7842214d9d09..e69de29bb2d1 100644 --- a/drivers/s390/crypto/zcrypt_cex2a.h +++ b/drivers/s390/crypto/zcrypt_cex2a.h @@ -1,134 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0+ */ -/* - * Copyright IBM Corp. 2001, 2006 - * Author(s): Robert Burroughs - * Eric Rossman (edrossma@us.ibm.com) - * - * Hotplug & misc device support: Jochen Roehrig (roehrig@de.ibm.com) - * Major cleanup & driver split: Martin Schwidefsky - */ - -#ifndef _ZCRYPT_CEX2A_H_ -#define _ZCRYPT_CEX2A_H_ - -/** - * The type 50 message family is associated with CEXxA cards. - * - * The four members of the family are described below. - * - * Note that all unsigned char arrays are right-justified and left-padded - * with zeroes. - * - * Note that all reserved fields must be zeroes. 
- */ -struct type50_hdr { - unsigned char reserved1; - unsigned char msg_type_code; /* 0x50 */ - unsigned short msg_len; - unsigned char reserved2; - unsigned char ignored; - unsigned short reserved3; -} __packed; - -#define TYPE50_TYPE_CODE 0x50 - -#define TYPE50_MEB1_FMT 0x0001 -#define TYPE50_MEB2_FMT 0x0002 -#define TYPE50_MEB3_FMT 0x0003 -#define TYPE50_CRB1_FMT 0x0011 -#define TYPE50_CRB2_FMT 0x0012 -#define TYPE50_CRB3_FMT 0x0013 - -/* Mod-Exp, with a small modulus */ -struct type50_meb1_msg { - struct type50_hdr header; - unsigned short keyblock_type; /* 0x0001 */ - unsigned char reserved[6]; - unsigned char exponent[128]; - unsigned char modulus[128]; - unsigned char message[128]; -} __packed; - -/* Mod-Exp, with a large modulus */ -struct type50_meb2_msg { - struct type50_hdr header; - unsigned short keyblock_type; /* 0x0002 */ - unsigned char reserved[6]; - unsigned char exponent[256]; - unsigned char modulus[256]; - unsigned char message[256]; -} __packed; - -/* Mod-Exp, with a larger modulus */ -struct type50_meb3_msg { - struct type50_hdr header; - unsigned short keyblock_type; /* 0x0003 */ - unsigned char reserved[6]; - unsigned char exponent[512]; - unsigned char modulus[512]; - unsigned char message[512]; -} __packed; - -/* CRT, with a small modulus */ -struct type50_crb1_msg { - struct type50_hdr header; - unsigned short keyblock_type; /* 0x0011 */ - unsigned char reserved[6]; - unsigned char p[64]; - unsigned char q[64]; - unsigned char dp[64]; - unsigned char dq[64]; - unsigned char u[64]; - unsigned char message[128]; -} __packed; - -/* CRT, with a large modulus */ -struct type50_crb2_msg { - struct type50_hdr header; - unsigned short keyblock_type; /* 0x0012 */ - unsigned char reserved[6]; - unsigned char p[128]; - unsigned char q[128]; - unsigned char dp[128]; - unsigned char dq[128]; - unsigned char u[128]; - unsigned char message[256]; -} __packed; - -/* CRT, with a larger modulus */ -struct type50_crb3_msg { - struct type50_hdr header; - 
unsigned short keyblock_type; /* 0x0013 */ - unsigned char reserved[6]; - unsigned char p[256]; - unsigned char q[256]; - unsigned char dp[256]; - unsigned char dq[256]; - unsigned char u[256]; - unsigned char message[512]; -} __packed; - -/** - * The type 80 response family is associated with a CEXxA cards. - * - * Note that all unsigned char arrays are right-justified and left-padded - * with zeroes. - * - * Note that all reserved fields must be zeroes. - */ - -#define TYPE80_RSP_CODE 0x80 - -struct type80_hdr { - unsigned char reserved1; - unsigned char type; /* 0x80 */ - unsigned short len; - unsigned char code; /* 0x00 */ - unsigned char reserved2[3]; - unsigned char reserved3[8]; -} __packed; - -int zcrypt_cex2a_init(void); -void zcrypt_cex2a_exit(void); - -#endif /* _ZCRYPT_CEX2A_H_ */ diff --git a/drivers/s390/crypto/zcrypt_cex2c.c b/drivers/s390/crypto/zcrypt_cex2c.c index 251b5bd3d19c..e69de29bb2d1 100644 --- a/drivers/s390/crypto/zcrypt_cex2c.c +++ b/drivers/s390/crypto/zcrypt_cex2c.c @@ -1,421 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0+ -/* - * Copyright IBM Corp. 
2001, 2018 - * Author(s): Robert Burroughs - * Eric Rossman (edrossma@us.ibm.com) - * - * Hotplug & misc device support: Jochen Roehrig (roehrig@de.ibm.com) - * Major cleanup & driver split: Martin Schwidefsky - * Ralph Wuerthner - * MSGTYPE restruct: Holger Dengler - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "ap_bus.h" -#include "zcrypt_api.h" -#include "zcrypt_error.h" -#include "zcrypt_msgtype6.h" -#include "zcrypt_cex2c.h" -#include "zcrypt_cca_key.h" -#include "zcrypt_ccamisc.h" - -#define CEX2C_MIN_MOD_SIZE 16 /* 128 bits */ -#define CEX2C_MAX_MOD_SIZE 256 /* 2048 bits */ -#define CEX3C_MIN_MOD_SIZE 16 /* 128 bits */ -#define CEX3C_MAX_MOD_SIZE 512 /* 4096 bits */ -#define CEX2C_MAX_XCRB_MESSAGE_SIZE (12 * 1024) -#define CEX2C_CLEANUP_TIME (15 * HZ) - -MODULE_AUTHOR("IBM Corporation"); -MODULE_DESCRIPTION("CEX2C/CEX3C Cryptographic Coprocessor device driver, " \ - "Copyright IBM Corp. 2001, 2018"); -MODULE_LICENSE("GPL"); - -static struct ap_device_id zcrypt_cex2c_card_ids[] = { - { .dev_type = AP_DEVICE_TYPE_CEX2C, - .match_flags = AP_DEVICE_ID_MATCH_CARD_TYPE }, - { .dev_type = AP_DEVICE_TYPE_CEX3C, - .match_flags = AP_DEVICE_ID_MATCH_CARD_TYPE }, - { /* end of list */ }, -}; - -MODULE_DEVICE_TABLE(ap, zcrypt_cex2c_card_ids); - -static struct ap_device_id zcrypt_cex2c_queue_ids[] = { - { .dev_type = AP_DEVICE_TYPE_CEX2C, - .match_flags = AP_DEVICE_ID_MATCH_QUEUE_TYPE }, - { .dev_type = AP_DEVICE_TYPE_CEX3C, - .match_flags = AP_DEVICE_ID_MATCH_QUEUE_TYPE }, - { /* end of list */ }, -}; - -MODULE_DEVICE_TABLE(ap, zcrypt_cex2c_queue_ids); - -/* - * CCA card additional device attributes - */ -static ssize_t cca_serialnr_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct zcrypt_card *zc = dev_get_drvdata(dev); - struct cca_info ci; - struct ap_card *ac = to_ap_card(dev); - - memset(&ci, 0, sizeof(ci)); - - if (ap_domain_index >= 0) - cca_get_info(ac->id, ap_domain_index, &ci, 
zc->online); - - return sysfs_emit(buf, "%s\n", ci.serial); -} - -static struct device_attribute dev_attr_cca_serialnr = - __ATTR(serialnr, 0444, cca_serialnr_show, NULL); - -static struct attribute *cca_card_attrs[] = { - &dev_attr_cca_serialnr.attr, - NULL, -}; - -static const struct attribute_group cca_card_attr_grp = { - .attrs = cca_card_attrs, -}; - - /* - * CCA queue additional device attributes - */ -static ssize_t cca_mkvps_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct zcrypt_queue *zq = dev_get_drvdata(dev); - int n = 0; - struct cca_info ci; - static const char * const cao_state[] = { "invalid", "valid" }; - static const char * const new_state[] = { "empty", "partial", "full" }; - - memset(&ci, 0, sizeof(ci)); - - cca_get_info(AP_QID_CARD(zq->queue->qid), - AP_QID_QUEUE(zq->queue->qid), - &ci, zq->online); - - if (ci.new_aes_mk_state >= '1' && ci.new_aes_mk_state <= '3') - n = sysfs_emit(buf, "AES NEW: %s 0x%016llx\n", - new_state[ci.new_aes_mk_state - '1'], - ci.new_aes_mkvp); - else - n = sysfs_emit(buf, "AES NEW: - -\n"); - - if (ci.cur_aes_mk_state >= '1' && ci.cur_aes_mk_state <= '2') - n += sysfs_emit_at(buf, n, "AES CUR: %s 0x%016llx\n", - cao_state[ci.cur_aes_mk_state - '1'], - ci.cur_aes_mkvp); - else - n += sysfs_emit_at(buf, n, "AES CUR: - -\n"); - - if (ci.old_aes_mk_state >= '1' && ci.old_aes_mk_state <= '2') - n += sysfs_emit_at(buf, n, "AES OLD: %s 0x%016llx\n", - cao_state[ci.old_aes_mk_state - '1'], - ci.old_aes_mkvp); - else - n += sysfs_emit_at(buf, n, "AES OLD: - -\n"); - - if (ci.new_apka_mk_state >= '1' && ci.new_apka_mk_state <= '3') - n += sysfs_emit_at(buf, n, "APKA NEW: %s 0x%016llx\n", - new_state[ci.new_apka_mk_state - '1'], - ci.new_apka_mkvp); - else - n += sysfs_emit_at(buf, n, "APKA NEW: - -\n"); - - if (ci.cur_apka_mk_state >= '1' && ci.cur_apka_mk_state <= '2') - n += sysfs_emit_at(buf, n, "APKA CUR: %s 0x%016llx\n", - cao_state[ci.cur_apka_mk_state - '1'], - ci.cur_apka_mkvp); - else 
- n += sysfs_emit_at(buf, n, "APKA CUR: - -\n"); - - if (ci.old_apka_mk_state >= '1' && ci.old_apka_mk_state <= '2') - n += sysfs_emit_at(buf, n, "APKA OLD: %s 0x%016llx\n", - cao_state[ci.old_apka_mk_state - '1'], - ci.old_apka_mkvp); - else - n += sysfs_emit_at(buf, n, "APKA OLD: - -\n"); - - return n; -} - -static struct device_attribute dev_attr_cca_mkvps = - __ATTR(mkvps, 0444, cca_mkvps_show, NULL); - -static struct attribute *cca_queue_attrs[] = { - &dev_attr_cca_mkvps.attr, - NULL, -}; - -static const struct attribute_group cca_queue_attr_grp = { - .attrs = cca_queue_attrs, -}; - -/* - * Large random number detection function. Its sends a message to a CEX2C/CEX3C - * card to find out if large random numbers are supported. - * @ap_dev: pointer to the AP device. - * - * Returns 1 if large random numbers are supported, 0 if not and < 0 on error. - */ -static int zcrypt_cex2c_rng_supported(struct ap_queue *aq) -{ - struct ap_message ap_msg; - unsigned long psmid; - unsigned int domain; - struct { - struct type86_hdr hdr; - struct type86_fmt2_ext fmt2; - struct CPRBX cprbx; - } __packed *reply; - struct { - struct type6_hdr hdr; - struct CPRBX cprbx; - char function_code[2]; - short int rule_length; - char rule[8]; - short int verb_length; - short int key_length; - } __packed *msg; - int rc, i; - - ap_init_message(&ap_msg); - ap_msg.msg = (void *)get_zeroed_page(GFP_KERNEL); - if (!ap_msg.msg) - return -ENOMEM; - ap_msg.bufsize = PAGE_SIZE; - - rng_type6cprb_msgx(&ap_msg, 4, &domain); - - msg = ap_msg.msg; - msg->cprbx.domain = AP_QID_QUEUE(aq->qid); - - rc = ap_send(aq->qid, 0x0102030405060708UL, ap_msg.msg, ap_msg.len); - if (rc) - goto out_free; - - /* Wait for the test message to complete. */ - for (i = 0; i < 2 * HZ; i++) { - msleep(1000 / HZ); - rc = ap_recv(aq->qid, &psmid, ap_msg.msg, ap_msg.bufsize); - if (rc == 0 && psmid == 0x0102030405060708UL) - break; - } - - if (i >= 2 * HZ) { - /* Got no answer. 
*/ - rc = -ENODEV; - goto out_free; - } - - reply = ap_msg.msg; - if (reply->cprbx.ccp_rtcode == 0 && reply->cprbx.ccp_rscode == 0) - rc = 1; - else - rc = 0; -out_free: - free_page((unsigned long)ap_msg.msg); - return rc; -} - -/* - * Probe function for CEX2C/CEX3C card devices. It always accepts the - * AP device since the bus_match already checked the hardware type. - * @ap_dev: pointer to the AP card device. - */ -static int zcrypt_cex2c_card_probe(struct ap_device *ap_dev) -{ - /* - * Normalized speed ratings per crypto adapter - * MEX_1k, MEX_2k, MEX_4k, CRT_1k, CRT_2k, CRT_4k, RNG, SECKEY - */ - static const int CEX2C_SPEED_IDX[] = { - 1000, 1400, 2400, 1100, 1500, 2600, 100, 12}; - static const int CEX3C_SPEED_IDX[] = { - 500, 700, 1400, 550, 800, 1500, 80, 10}; - - struct ap_card *ac = to_ap_card(&ap_dev->device); - struct zcrypt_card *zc; - int rc = 0; - - zc = zcrypt_card_alloc(); - if (!zc) - return -ENOMEM; - zc->card = ac; - dev_set_drvdata(&ap_dev->device, zc); - switch (ac->ap_dev.device_type) { - case AP_DEVICE_TYPE_CEX2C: - zc->user_space_type = ZCRYPT_CEX2C; - zc->type_string = "CEX2C"; - zc->speed_rating = CEX2C_SPEED_IDX; - zc->min_mod_size = CEX2C_MIN_MOD_SIZE; - zc->max_mod_size = CEX2C_MAX_MOD_SIZE; - zc->max_exp_bit_length = CEX2C_MAX_MOD_SIZE; - break; - case AP_DEVICE_TYPE_CEX3C: - zc->user_space_type = ZCRYPT_CEX3C; - zc->type_string = "CEX3C"; - zc->speed_rating = CEX3C_SPEED_IDX; - zc->min_mod_size = CEX3C_MIN_MOD_SIZE; - zc->max_mod_size = CEX3C_MAX_MOD_SIZE; - zc->max_exp_bit_length = CEX3C_MAX_MOD_SIZE; - break; - default: - zcrypt_card_free(zc); - return -ENODEV; - } - zc->online = 1; - - rc = zcrypt_card_register(zc); - if (rc) { - zcrypt_card_free(zc); - return rc; - } - - if (ap_test_bit(&ac->functions, AP_FUNC_COPRO)) { - rc = sysfs_create_group(&ap_dev->device.kobj, - &cca_card_attr_grp); - if (rc) { - zcrypt_card_unregister(zc); - zcrypt_card_free(zc); - } - } - - return rc; -} - -/* - * This is called to remove the 
CEX2C/CEX3C card driver information - * if an AP card device is removed. - */ -static void zcrypt_cex2c_card_remove(struct ap_device *ap_dev) -{ - struct zcrypt_card *zc = dev_get_drvdata(&ap_dev->device); - struct ap_card *ac = to_ap_card(&ap_dev->device); - - if (ap_test_bit(&ac->functions, AP_FUNC_COPRO)) - sysfs_remove_group(&ap_dev->device.kobj, &cca_card_attr_grp); - - zcrypt_card_unregister(zc); -} - -static struct ap_driver zcrypt_cex2c_card_driver = { - .probe = zcrypt_cex2c_card_probe, - .remove = zcrypt_cex2c_card_remove, - .ids = zcrypt_cex2c_card_ids, - .flags = AP_DRIVER_FLAG_DEFAULT, -}; - -/* - * Probe function for CEX2C/CEX3C queue devices. It always accepts the - * AP device since the bus_match already checked the hardware type. - * @ap_dev: pointer to the AP card device. - */ -static int zcrypt_cex2c_queue_probe(struct ap_device *ap_dev) -{ - struct ap_queue *aq = to_ap_queue(&ap_dev->device); - struct zcrypt_queue *zq; - int rc; - - zq = zcrypt_queue_alloc(CEX2C_MAX_XCRB_MESSAGE_SIZE); - if (!zq) - return -ENOMEM; - zq->queue = aq; - zq->online = 1; - atomic_set(&zq->load, 0); - ap_rapq(aq->qid, 0); - rc = zcrypt_cex2c_rng_supported(aq); - if (rc < 0) { - zcrypt_queue_free(zq); - return rc; - } - if (rc) - zq->ops = zcrypt_msgtype(MSGTYPE06_NAME, - MSGTYPE06_VARIANT_DEFAULT); - else - zq->ops = zcrypt_msgtype(MSGTYPE06_NAME, - MSGTYPE06_VARIANT_NORNG); - ap_queue_init_state(aq); - ap_queue_init_reply(aq, &zq->reply); - aq->request_timeout = CEX2C_CLEANUP_TIME; - dev_set_drvdata(&ap_dev->device, zq); - rc = zcrypt_queue_register(zq); - if (rc) { - zcrypt_queue_free(zq); - return rc; - } - - if (ap_test_bit(&aq->card->functions, AP_FUNC_COPRO)) { - rc = sysfs_create_group(&ap_dev->device.kobj, - &cca_queue_attr_grp); - if (rc) { - zcrypt_queue_unregister(zq); - zcrypt_queue_free(zq); - } - } - - return rc; -} - -/* - * This is called to remove the CEX2C/CEX3C queue driver information - * if an AP queue device is removed. 
- */ -static void zcrypt_cex2c_queue_remove(struct ap_device *ap_dev) -{ - struct zcrypt_queue *zq = dev_get_drvdata(&ap_dev->device); - struct ap_queue *aq = to_ap_queue(&ap_dev->device); - - if (ap_test_bit(&aq->card->functions, AP_FUNC_COPRO)) - sysfs_remove_group(&ap_dev->device.kobj, &cca_queue_attr_grp); - - zcrypt_queue_unregister(zq); -} - -static struct ap_driver zcrypt_cex2c_queue_driver = { - .probe = zcrypt_cex2c_queue_probe, - .remove = zcrypt_cex2c_queue_remove, - .ids = zcrypt_cex2c_queue_ids, - .flags = AP_DRIVER_FLAG_DEFAULT, -}; - -int __init zcrypt_cex2c_init(void) -{ - int rc; - - rc = ap_driver_register(&zcrypt_cex2c_card_driver, - THIS_MODULE, "cex2card"); - if (rc) - return rc; - - rc = ap_driver_register(&zcrypt_cex2c_queue_driver, - THIS_MODULE, "cex2cqueue"); - if (rc) - ap_driver_unregister(&zcrypt_cex2c_card_driver); - - return rc; -} - -void zcrypt_cex2c_exit(void) -{ - ap_driver_unregister(&zcrypt_cex2c_queue_driver); - ap_driver_unregister(&zcrypt_cex2c_card_driver); -} - -module_init(zcrypt_cex2c_init); -module_exit(zcrypt_cex2c_exit); diff --git a/drivers/s390/crypto/zcrypt_cex2c.h b/drivers/s390/crypto/zcrypt_cex2c.h index 6ec405c2bec2..e69de29bb2d1 100644 --- a/drivers/s390/crypto/zcrypt_cex2c.h +++ b/drivers/s390/crypto/zcrypt_cex2c.h @@ -1,18 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0+ */ -/* - * Copyright IBM Corp. 
2001, 2018 - * Author(s): Robert Burroughs - * Eric Rossman (edrossma@us.ibm.com) - * - * Hotplug & misc device support: Jochen Roehrig (roehrig@de.ibm.com) - * Major cleanup & driver split: Martin Schwidefsky - * MSGTYPE restruct: Holger Dengler - */ - -#ifndef _ZCRYPT_CEX2C_H_ -#define _ZCRYPT_CEX2C_H_ - -int zcrypt_cex2c_init(void); -void zcrypt_cex2c_exit(void); - -#endif /* _ZCRYPT_CEX2C_H_ */ diff --git a/drivers/s390/crypto/zcrypt_msgtype50.c b/drivers/s390/crypto/zcrypt_msgtype50.c index 51f8f7a463f7..2e155de8abe5 100644 --- a/drivers/s390/crypto/zcrypt_msgtype50.c +++ b/drivers/s390/crypto/zcrypt_msgtype50.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0+ /* - * Copyright IBM Corp. 2001, 2012 + * Copyright IBM Corp. 2001, 2023 * Author(s): Robert Burroughs * Eric Rossman (edrossma@us.ibm.com) * @@ -28,15 +28,12 @@ /* >= CEX3A: 4096 bits */ #define CEX3A_MAX_MOD_SIZE 512 -/* CEX2A: max outputdatalength + type80_hdr */ -#define CEX2A_MAX_RESPONSE_SIZE 0x110 - /* >= CEX3A: 512 bit modulus, (max outputdatalength) + type80_hdr */ #define CEX3A_MAX_RESPONSE_SIZE 0x210 MODULE_AUTHOR("IBM Corporation"); MODULE_DESCRIPTION("Cryptographic Accelerator (message type 50), " \ - "Copyright IBM Corp. 2001, 2012"); + "Copyright IBM Corp. 
2001, 2023"); MODULE_LICENSE("GPL"); /* @@ -366,20 +363,17 @@ static int convert_type80(struct zcrypt_queue *zq, ap_send_online_uevent(&zq->queue->ap_dev, zq->online); return -EAGAIN; } - if (zq->zcard->user_space_type == ZCRYPT_CEX2A) - BUG_ON(t80h->len > CEX2A_MAX_RESPONSE_SIZE); - else - BUG_ON(t80h->len > CEX3A_MAX_RESPONSE_SIZE); + BUG_ON(t80h->len > CEX3A_MAX_RESPONSE_SIZE); data = reply->msg + t80h->len - outputdatalength; if (copy_to_user(outputdata, data, outputdatalength)) return -EFAULT; return 0; } -static int convert_response_cex2a(struct zcrypt_queue *zq, - struct ap_message *reply, - char __user *outputdata, - unsigned int outputdatalength) +static int convert_response(struct zcrypt_queue *zq, + struct ap_message *reply, + char __user *outputdata, + unsigned int outputdatalength) { /* Response type byte is the second byte in the response. */ unsigned char rtype = ((unsigned char *)reply->msg)[1]; @@ -414,9 +408,9 @@ static int convert_response_cex2a(struct zcrypt_queue *zq, * @msg: pointer to the AP message * @reply: pointer to the AP reply message */ -static void zcrypt_cex2a_receive(struct ap_queue *aq, - struct ap_message *msg, - struct ap_message *reply) +static void zcrypt_msgtype50_receive(struct ap_queue *aq, + struct ap_message *msg, + struct ap_message *reply) { static struct error_hdr error_reply = { .type = TYPE82_RSP_CODE, @@ -456,19 +450,18 @@ static atomic_t zcrypt_step = ATOMIC_INIT(0); * CEXxA device to the request distributor * @mex: pointer to the modexpo request buffer */ -static long zcrypt_cex2a_modexpo(struct zcrypt_queue *zq, - struct ica_rsa_modexpo *mex, - struct ap_message *ap_msg) +static long zcrypt_msgtype50_modexpo(struct zcrypt_queue *zq, + struct ica_rsa_modexpo *mex, + struct ap_message *ap_msg) { struct completion work; int rc; - ap_msg->bufsize = (zq->zcard->user_space_type == ZCRYPT_CEX2A) ? 
- MSGTYPE50_CRB2_MAX_MSG_SIZE : MSGTYPE50_CRB3_MAX_MSG_SIZE; + ap_msg->bufsize = MSGTYPE50_CRB3_MAX_MSG_SIZE; ap_msg->msg = kmalloc(ap_msg->bufsize, GFP_KERNEL); if (!ap_msg->msg) return -ENOMEM; - ap_msg->receive = zcrypt_cex2a_receive; + ap_msg->receive = zcrypt_msgtype50_receive; ap_msg->psmid = (((unsigned long)current->pid) << 32) + atomic_inc_return(&zcrypt_step); ap_msg->private = &work; @@ -483,9 +476,9 @@ static long zcrypt_cex2a_modexpo(struct zcrypt_queue *zq, if (rc == 0) { rc = ap_msg->rc; if (rc == 0) - rc = convert_response_cex2a(zq, ap_msg, - mex->outputdata, - mex->outputdatalength); + rc = convert_response(zq, ap_msg, + mex->outputdata, + mex->outputdatalength); } else { /* Signal pending. */ ap_cancel_message(zq->queue, ap_msg); @@ -507,19 +500,18 @@ out: * CEXxA device to the request distributor * @crt: pointer to the modexpoc_crt request buffer */ -static long zcrypt_cex2a_modexpo_crt(struct zcrypt_queue *zq, - struct ica_rsa_modexpo_crt *crt, - struct ap_message *ap_msg) +static long zcrypt_msgtype50_modexpo_crt(struct zcrypt_queue *zq, + struct ica_rsa_modexpo_crt *crt, + struct ap_message *ap_msg) { struct completion work; int rc; - ap_msg->bufsize = (zq->zcard->user_space_type == ZCRYPT_CEX2A) ? - MSGTYPE50_CRB2_MAX_MSG_SIZE : MSGTYPE50_CRB3_MAX_MSG_SIZE; + ap_msg->bufsize = MSGTYPE50_CRB3_MAX_MSG_SIZE; ap_msg->msg = kmalloc(ap_msg->bufsize, GFP_KERNEL); if (!ap_msg->msg) return -ENOMEM; - ap_msg->receive = zcrypt_cex2a_receive; + ap_msg->receive = zcrypt_msgtype50_receive; ap_msg->psmid = (((unsigned long)current->pid) << 32) + atomic_inc_return(&zcrypt_step); ap_msg->private = &work; @@ -534,9 +526,9 @@ static long zcrypt_cex2a_modexpo_crt(struct zcrypt_queue *zq, if (rc == 0) { rc = ap_msg->rc; if (rc == 0) - rc = convert_response_cex2a(zq, ap_msg, - crt->outputdata, - crt->outputdatalength); + rc = convert_response(zq, ap_msg, + crt->outputdata, + crt->outputdatalength); } else { /* Signal pending. 
*/ ap_cancel_message(zq->queue, ap_msg); @@ -555,8 +547,8 @@ out: * The crypto operations for message type 50. */ static struct zcrypt_ops zcrypt_msgtype50_ops = { - .rsa_modexpo = zcrypt_cex2a_modexpo, - .rsa_modexpo_crt = zcrypt_cex2a_modexpo_crt, + .rsa_modexpo = zcrypt_msgtype50_modexpo, + .rsa_modexpo_crt = zcrypt_msgtype50_modexpo_crt, .owner = THIS_MODULE, .name = MSGTYPE50_NAME, .variant = MSGTYPE50_VARIANT_DEFAULT, diff --git a/drivers/s390/crypto/zcrypt_msgtype50.h b/drivers/s390/crypto/zcrypt_msgtype50.h index eb49f06bed29..323e93b90b12 100644 --- a/drivers/s390/crypto/zcrypt_msgtype50.h +++ b/drivers/s390/crypto/zcrypt_msgtype50.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0+ */ /* - * Copyright IBM Corp. 2001, 2012 + * Copyright IBM Corp. 2001, 2023 * Author(s): Robert Burroughs * Eric Rossman (edrossma@us.ibm.com) * @@ -15,7 +15,6 @@ #define MSGTYPE50_NAME "zcrypt_msgtype50" #define MSGTYPE50_VARIANT_DEFAULT 0 -#define MSGTYPE50_CRB2_MAX_MSG_SIZE 0x390 /* sizeof(struct type50_crb2_msg) */ #define MSGTYPE50_CRB3_MAX_MSG_SIZE 0x710 /* sizeof(struct type50_crb3_msg) */ #define MSGTYPE_ADJUSTMENT 0x08 /* type04 extension (not needed in type50) */ diff --git a/drivers/s390/crypto/zcrypt_msgtype6.c b/drivers/s390/crypto/zcrypt_msgtype6.c index e668ff5eb384..3c53abbdc342 100644 --- a/drivers/s390/crypto/zcrypt_msgtype6.c +++ b/drivers/s390/crypto/zcrypt_msgtype6.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0+ /* - * Copyright IBM Corp. 2001, 2022 + * Copyright IBM Corp. 2001, 2023 * Author(s): Robert Burroughs * Eric Rossman (edrossma@us.ibm.com) * @@ -42,7 +42,7 @@ struct response_type { MODULE_AUTHOR("IBM Corporation"); MODULE_DESCRIPTION("Cryptographic Coprocessor (message type 6), " \ - "Copyright IBM Corp. 2001, 2012"); + "Copyright IBM Corp. 2001, 2023"); MODULE_LICENSE("GPL"); struct function_and_rules_block { @@ -1348,14 +1348,6 @@ out: /* * The crypto operations for a CEXxC card. 
*/ -static struct zcrypt_ops zcrypt_msgtype6_norng_ops = { - .owner = THIS_MODULE, - .name = MSGTYPE06_NAME, - .variant = MSGTYPE06_VARIANT_NORNG, - .rsa_modexpo = zcrypt_msgtype6_modexpo, - .rsa_modexpo_crt = zcrypt_msgtype6_modexpo_crt, - .send_cprb = zcrypt_msgtype6_send_cprb, -}; static struct zcrypt_ops zcrypt_msgtype6_ops = { .owner = THIS_MODULE, @@ -1378,14 +1370,12 @@ static struct zcrypt_ops zcrypt_msgtype6_ep11_ops = { void __init zcrypt_msgtype6_init(void) { - zcrypt_msgtype_register(&zcrypt_msgtype6_norng_ops); zcrypt_msgtype_register(&zcrypt_msgtype6_ops); zcrypt_msgtype_register(&zcrypt_msgtype6_ep11_ops); } void __exit zcrypt_msgtype6_exit(void) { - zcrypt_msgtype_unregister(&zcrypt_msgtype6_norng_ops); zcrypt_msgtype_unregister(&zcrypt_msgtype6_ops); zcrypt_msgtype_unregister(&zcrypt_msgtype6_ep11_ops); } -- cgit v1.2.3 From 83f95671943e6394eda4d20fa9458d4a2ae13c5c Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 4 Jul 2023 15:47:11 +0200 Subject: s390/hypfs: simplify memory allocation Simplify memory allocation for diagnose 204 memory buffer: - allocate with __vmalloc_node() to ensure page alignment - allocate real / physical memory area also within vmalloc area and handle vmalloc to real / physical address translation within diag204().
Acked-by: Alexander Gordeev Reviewed-by: Mete Durlu Signed-off-by: Heiko Carstens --- arch/s390/hypfs/hypfs_diag.c | 40 ++++++++-------------------------------- arch/s390/kernel/diag.c | 24 ++++++++++++++++++++++-- 2 files changed, 30 insertions(+), 34 deletions(-) diff --git a/arch/s390/hypfs/hypfs_diag.c b/arch/s390/hypfs/hypfs_diag.c index c3be533c4cd3..c8083dc08db3 100644 --- a/arch/s390/hypfs/hypfs_diag.c +++ b/arch/s390/hypfs/hypfs_diag.c @@ -29,7 +29,6 @@ static enum diag204_sc diag204_store_sc; /* used subcode for store */ static enum diag204_format diag204_info_type; /* used diag 204 data format */ static void *diag204_buf; /* 4K aligned buffer for diag204 data */ -static void *diag204_buf_vmalloc; /* vmalloc pointer for diag204 data */ static int diag204_buf_pages; /* number of pages for diag204 data */ static struct dentry *dbfs_d204_file; @@ -212,14 +211,7 @@ static inline __u64 phys_cpu__ctidx(enum diag204_format type, void *hdr) static void diag204_free_buffer(void) { - if (!diag204_buf) - return; - if (diag204_buf_vmalloc) { - vfree(diag204_buf_vmalloc); - diag204_buf_vmalloc = NULL; - } else { - free_pages((unsigned long) diag204_buf, 0); - } + vfree(diag204_buf); diag204_buf = NULL; } @@ -228,26 +220,6 @@ static void *page_align_ptr(void *ptr) return (void *) PAGE_ALIGN((unsigned long) ptr); } -static void *diag204_alloc_vbuf(int pages) -{ - /* The buffer has to be page aligned! 
*/ - diag204_buf_vmalloc = vmalloc(array_size(PAGE_SIZE, (pages + 1))); - if (!diag204_buf_vmalloc) - return ERR_PTR(-ENOMEM); - diag204_buf = page_align_ptr(diag204_buf_vmalloc); - diag204_buf_pages = pages; - return diag204_buf; -} - -static void *diag204_alloc_rbuf(void) -{ - diag204_buf = (void*)__get_free_pages(GFP_KERNEL,0); - if (!diag204_buf) - return ERR_PTR(-ENOMEM); - diag204_buf_pages = 1; - return diag204_buf; -} - static void *diag204_get_buffer(enum diag204_format fmt, int *pages) { if (diag204_buf) { @@ -256,15 +228,19 @@ static void *diag204_get_buffer(enum diag204_format fmt, int *pages) } if (fmt == DIAG204_INFO_SIMPLE) { *pages = 1; - return diag204_alloc_rbuf(); } else {/* DIAG204_INFO_EXT */ *pages = diag204((unsigned long)DIAG204_SUBC_RSI | (unsigned long)DIAG204_INFO_EXT, 0, NULL); if (*pages <= 0) return ERR_PTR(-ENOSYS); - else - return diag204_alloc_vbuf(*pages); } + diag204_buf = __vmalloc_node(array_size(*pages, PAGE_SIZE), + PAGE_SIZE, GFP_KERNEL, NUMA_NO_NODE, + __builtin_return_address(0)); + if (!diag204_buf) + return ERR_PTR(-ENOMEM); + diag204_buf_pages = *pages; + return diag204_buf; } /* diff --git a/arch/s390/kernel/diag.c b/arch/s390/kernel/diag.c index f3a0f39cbd6c..f287713baf6d 100644 --- a/arch/s390/kernel/diag.c +++ b/arch/s390/kernel/diag.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -168,11 +169,30 @@ static inline int __diag204(unsigned long *subcode, unsigned long size, void *ad return rp.odd; } +/** + * diag204() - Issue diagnose 204 call. + * @subcode: Subcode of diagnose 204 to be executed. + * @size: Size of area in pages which @addr points to, if given. + * @addr: Vmalloc'ed memory area where the result is written to. + * + * Execute diagnose 204 with the given subcode and write the result to the + * memory area specified with @addr. For subcodes which do not write a + * result to memory both @size and @addr must be zero.
If @addr is + * specified it must be page aligned and must have been allocated with + * vmalloc(). Conversion to real / physical addresses will be handled by + * this function if required. + */ int diag204(unsigned long subcode, unsigned long size, void *addr) { - diag_stat_inc(DIAG_STAT_X204); + if (addr) { + if (WARN_ON_ONCE(!is_vmalloc_addr(addr))) + return -1; + if (WARN_ON_ONCE(!IS_ALIGNED((unsigned long)addr, PAGE_SIZE))) + return -1; + } if ((subcode & DIAG204_SUBCODE_MASK) == DIAG204_SUBC_STIB4) - addr = (void *)__pa(addr); + addr = (void *)pfn_to_phys(vmalloc_to_pfn(addr)); + diag_stat_inc(DIAG_STAT_X204); size = __diag204(&subcode, size, addr); if (subcode) return -1; -- cgit v1.2.3 From b7857acc1b1105da5f088fe2593f1a6e3a3d47ce Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 4 Jul 2023 15:47:12 +0200 Subject: s390/hypfs: remove open-coded PTR_ALIGN() Get rid of page_align_ptr() and use PTR_ALIGN() instead. Acked-by: Alexander Gordeev Signed-off-by: Heiko Carstens --- arch/s390/hypfs/hypfs_diag.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/arch/s390/hypfs/hypfs_diag.c b/arch/s390/hypfs/hypfs_diag.c index c8083dc08db3..889d83c77826 100644 --- a/arch/s390/hypfs/hypfs_diag.c +++ b/arch/s390/hypfs/hypfs_diag.c @@ -215,11 +215,6 @@ static void diag204_free_buffer(void) diag204_buf = NULL; } -static void *page_align_ptr(void *ptr) -{ - return (void *) PAGE_ALIGN((unsigned long) ptr); -} - static void *diag204_get_buffer(enum diag204_format fmt, int *pages) { if (diag204_buf) { @@ -379,7 +374,7 @@ static int dbfs_d204_create(void **data, void **data_free_ptr, size_t *size) base = vzalloc(buf_size); if (!base) return -ENOMEM; - d204 = page_align_ptr(base + sizeof(d204->hdr)) - sizeof(d204->hdr); + d204 = PTR_ALIGN(base + sizeof(d204->hdr), PAGE_SIZE) - sizeof(d204->hdr); rc = diag204_do_store(d204->buf, diag204_buf_pages); if (rc) { vfree(base); -- cgit v1.2.3 From 3325b4d85799957aa53514e69bed5c9df7771caf Mon Sep 17 00:00:00 
2001 From: Heiko Carstens Date: Tue, 4 Jul 2023 15:47:13 +0200 Subject: s390/hypfs: factor out filesystem code The s390_hypfs filesystem is deprecated and shouldn't be used due to its rather odd semantics. It creates a whole directory structure with static file contents so a user can read a consistent state while within that directory. Writing to its update attribute will remove and rebuild nearly the whole filesystem, so that again a user can read a consistent state, even if multiple files need to be read. Given that this wastes a lot of CPU cycles, and involves a lot of code, binary interfaces were added quite a few years ago, which simply pass the binary data to user space, and let user space decode the data. This is the preferred and only way the data should be retrieved. The assumption is that there are no users of the s390_hypfs filesystem. However, instead of just removing the code, and having to revert in case there are actually users, factor the filesystem code out and make it only available via a new config option. This config option is supposed to be disabled. If it turns out there are no complaints the filesystem code can probably be removed in a couple of years.
Acked-by: Alexander Gordeev Signed-off-by: Heiko Carstens --- arch/s390/Kbuild | 2 +- arch/s390/Kconfig | 15 +- arch/s390/hypfs/Makefile | 11 +- arch/s390/hypfs/hypfs.h | 10 +- arch/s390/hypfs/hypfs_dbfs.c | 31 +++- arch/s390/hypfs/hypfs_diag.c | 400 ++-------------------------------------- arch/s390/hypfs/hypfs_diag.h | 35 ++++ arch/s390/hypfs/hypfs_diag_fs.c | 393 +++++++++++++++++++++++++++++++++++++++ arch/s390/hypfs/hypfs_vm.c | 175 ++---------------- arch/s390/hypfs/hypfs_vm.h | 50 +++++ arch/s390/hypfs/hypfs_vm_fs.c | 139 ++++++++++++++ arch/s390/hypfs/inode.c | 35 +--- 12 files changed, 709 insertions(+), 587 deletions(-) create mode 100644 arch/s390/hypfs/hypfs_diag.h create mode 100644 arch/s390/hypfs/hypfs_diag_fs.c create mode 100644 arch/s390/hypfs/hypfs_vm.h create mode 100644 arch/s390/hypfs/hypfs_vm_fs.c diff --git a/arch/s390/Kbuild b/arch/s390/Kbuild index 76e362277179..8e4d74f51115 100644 --- a/arch/s390/Kbuild +++ b/arch/s390/Kbuild @@ -3,7 +3,7 @@ obj-y += kernel/ obj-y += mm/ obj-$(CONFIG_KVM) += kvm/ obj-y += crypto/ -obj-$(CONFIG_S390_HYPFS_FS) += hypfs/ +obj-$(CONFIG_S390_HYPFS) += hypfs/ obj-$(CONFIG_APPLDATA_BASE) += appldata/ obj-y += net/ obj-$(CONFIG_PCI) += pci/ diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index c0afca69904e..ea0e8f34eb0d 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -877,13 +877,24 @@ config APPLDATA_NET_SUM This can also be compiled as a module, which will be called appldata_net_sum.o. -config S390_HYPFS_FS +config S390_HYPFS def_bool y + prompt "s390 hypervisor information" + help + This provides several binary files at (debugfs)/s390_hypfs/ to + provide accounting information in an s390 hypervisor environment. + +config S390_HYPFS_FS + def_bool n prompt "s390 hypervisor file system support" select SYS_HYPERVISOR + depends on S390_HYPFS help This is a virtual file system intended to provide accounting - information in an s390 hypervisor environment. + information in an s390 hypervisor environment. 
This file system + is deprecated and should not be used. + + Say N if you are unsure. source "arch/s390/kvm/Kconfig" diff --git a/arch/s390/hypfs/Makefile b/arch/s390/hypfs/Makefile index 06f601509ce9..c34854d298f8 100644 --- a/arch/s390/hypfs/Makefile +++ b/arch/s390/hypfs/Makefile @@ -3,7 +3,12 @@ # Makefile for the linux hypfs filesystem routines. # -obj-$(CONFIG_S390_HYPFS_FS) += s390_hypfs.o +obj-$(CONFIG_S390_HYPFS) += hypfs_dbfs.o +obj-$(CONFIG_S390_HYPFS) += hypfs_diag.o +obj-$(CONFIG_S390_HYPFS) += hypfs_diag0c.o +obj-$(CONFIG_S390_HYPFS) += hypfs_sprp.o +obj-$(CONFIG_S390_HYPFS) += hypfs_vm.o -s390_hypfs-objs := inode.o hypfs_diag.o hypfs_vm.o hypfs_dbfs.o hypfs_sprp.o -s390_hypfs-objs += hypfs_diag0c.o +obj-$(CONFIG_S390_HYPFS_FS) += hypfs_diag_fs.o +obj-$(CONFIG_S390_HYPFS_FS) += hypfs_vm_fs.o +obj-$(CONFIG_S390_HYPFS_FS) += inode.o diff --git a/arch/s390/hypfs/hypfs.h b/arch/s390/hypfs/hypfs.h index 05f3f9aee5fc..65f4036fd541 100644 --- a/arch/s390/hypfs/hypfs.h +++ b/arch/s390/hypfs/hypfs.h @@ -46,6 +46,15 @@ void hypfs_diag0c_exit(void); void hypfs_sprp_init(void); void hypfs_sprp_exit(void); +int __hypfs_fs_init(void); + +static inline int hypfs_fs_init(void) +{ + if (IS_ENABLED(CONFIG_S390_HYPFS_FS)) + return __hypfs_fs_init(); + return 0; +} + /* debugfs interface */ struct hypfs_dbfs_file; @@ -69,7 +78,6 @@ struct hypfs_dbfs_file { struct dentry *dentry; }; -extern void hypfs_dbfs_init(void); extern void hypfs_dbfs_exit(void); extern void hypfs_dbfs_create_file(struct hypfs_dbfs_file *df); extern void hypfs_dbfs_remove_file(struct hypfs_dbfs_file *df); diff --git a/arch/s390/hypfs/hypfs_dbfs.c b/arch/s390/hypfs/hypfs_dbfs.c index f4c7dbfaf8ee..4024599eb448 100644 --- a/arch/s390/hypfs/hypfs_dbfs.c +++ b/arch/s390/hypfs/hypfs_dbfs.c @@ -90,12 +90,33 @@ void hypfs_dbfs_remove_file(struct hypfs_dbfs_file *df) debugfs_remove(df->dentry); } -void hypfs_dbfs_init(void) +static int __init hypfs_dbfs_init(void) { - dbfs_dir = 
debugfs_create_dir("s390_hypfs", NULL); -} + int rc = -ENODATA; -void hypfs_dbfs_exit(void) -{ + dbfs_dir = debugfs_create_dir("s390_hypfs", NULL); + if (hypfs_diag_init()) + goto fail_dbfs_exit; + if (hypfs_vm_init()) + goto fail_hypfs_diag_exit; + hypfs_sprp_init(); + if (hypfs_diag0c_init()) + goto fail_hypfs_sprp_exit; + rc = hypfs_fs_init(); + if (rc) + goto fail_hypfs_diag0c_exit; + return 0; + +fail_hypfs_diag0c_exit: + hypfs_diag0c_exit(); +fail_hypfs_sprp_exit: + hypfs_sprp_exit(); + hypfs_vm_exit(); +fail_hypfs_diag_exit: + hypfs_diag_exit(); + pr_err("Initialization of hypfs failed with rc=%i\n", rc); +fail_dbfs_exit: debugfs_remove(dbfs_dir); + return rc; } +device_initcall(hypfs_dbfs_init) diff --git a/arch/s390/hypfs/hypfs_diag.c b/arch/s390/hypfs/hypfs_diag.c index 889d83c77826..ea4c436f86a0 100644 --- a/arch/s390/hypfs/hypfs_diag.c +++ b/arch/s390/hypfs/hypfs_diag.c @@ -18,13 +18,11 @@ #include #include #include +#include "hypfs_diag.h" #include "hypfs.h" -#define TMP_SIZE 64 /* size of temporary buffers */ - #define DBFS_D204_HDR_VERSION 0 -static char *diag224_cpu_names; /* diag 224 name table */ static enum diag204_sc diag204_store_sc; /* used subcode for store */ static enum diag204_format diag204_info_type; /* used diag 204 data format */ @@ -33,172 +31,14 @@ static int diag204_buf_pages; /* number of pages for diag204 data */ static struct dentry *dbfs_d204_file; -/* - * DIAG 204 member access functions. - * - * Since we have two different diag 204 data formats for old and new s390 - * machines, we do not access the structs directly, but use getter functions for - * each struct member instead. This should make the code more readable. 
- */ - -/* Time information block */ - -static inline int info_blk_hdr__size(enum diag204_format type) -{ - if (type == DIAG204_INFO_SIMPLE) - return sizeof(struct diag204_info_blk_hdr); - else /* DIAG204_INFO_EXT */ - return sizeof(struct diag204_x_info_blk_hdr); -} - -static inline __u8 info_blk_hdr__npar(enum diag204_format type, void *hdr) -{ - if (type == DIAG204_INFO_SIMPLE) - return ((struct diag204_info_blk_hdr *)hdr)->npar; - else /* DIAG204_INFO_EXT */ - return ((struct diag204_x_info_blk_hdr *)hdr)->npar; -} - -static inline __u8 info_blk_hdr__flags(enum diag204_format type, void *hdr) -{ - if (type == DIAG204_INFO_SIMPLE) - return ((struct diag204_info_blk_hdr *)hdr)->flags; - else /* DIAG204_INFO_EXT */ - return ((struct diag204_x_info_blk_hdr *)hdr)->flags; -} - -/* Partition header */ - -static inline int part_hdr__size(enum diag204_format type) +enum diag204_format diag204_get_info_type(void) { - if (type == DIAG204_INFO_SIMPLE) - return sizeof(struct diag204_part_hdr); - else /* DIAG204_INFO_EXT */ - return sizeof(struct diag204_x_part_hdr); + return diag204_info_type; } -static inline __u8 part_hdr__rcpus(enum diag204_format type, void *hdr) +static void diag204_set_info_type(enum diag204_format type) { - if (type == DIAG204_INFO_SIMPLE) - return ((struct diag204_part_hdr *)hdr)->cpus; - else /* DIAG204_INFO_EXT */ - return ((struct diag204_x_part_hdr *)hdr)->rcpus; -} - -static inline void part_hdr__part_name(enum diag204_format type, void *hdr, - char *name) -{ - if (type == DIAG204_INFO_SIMPLE) - memcpy(name, ((struct diag204_part_hdr *)hdr)->part_name, - DIAG204_LPAR_NAME_LEN); - else /* DIAG204_INFO_EXT */ - memcpy(name, ((struct diag204_x_part_hdr *)hdr)->part_name, - DIAG204_LPAR_NAME_LEN); - EBCASC(name, DIAG204_LPAR_NAME_LEN); - name[DIAG204_LPAR_NAME_LEN] = 0; - strim(name); -} - -/* CPU info block */ - -static inline int cpu_info__size(enum diag204_format type) -{ - if (type == DIAG204_INFO_SIMPLE) - return sizeof(struct 
diag204_cpu_info); - else /* DIAG204_INFO_EXT */ - return sizeof(struct diag204_x_cpu_info); -} - -static inline __u8 cpu_info__ctidx(enum diag204_format type, void *hdr) -{ - if (type == DIAG204_INFO_SIMPLE) - return ((struct diag204_cpu_info *)hdr)->ctidx; - else /* DIAG204_INFO_EXT */ - return ((struct diag204_x_cpu_info *)hdr)->ctidx; -} - -static inline __u16 cpu_info__cpu_addr(enum diag204_format type, void *hdr) -{ - if (type == DIAG204_INFO_SIMPLE) - return ((struct diag204_cpu_info *)hdr)->cpu_addr; - else /* DIAG204_INFO_EXT */ - return ((struct diag204_x_cpu_info *)hdr)->cpu_addr; -} - -static inline __u64 cpu_info__acc_time(enum diag204_format type, void *hdr) -{ - if (type == DIAG204_INFO_SIMPLE) - return ((struct diag204_cpu_info *)hdr)->acc_time; - else /* DIAG204_INFO_EXT */ - return ((struct diag204_x_cpu_info *)hdr)->acc_time; -} - -static inline __u64 cpu_info__lp_time(enum diag204_format type, void *hdr) -{ - if (type == DIAG204_INFO_SIMPLE) - return ((struct diag204_cpu_info *)hdr)->lp_time; - else /* DIAG204_INFO_EXT */ - return ((struct diag204_x_cpu_info *)hdr)->lp_time; -} - -static inline __u64 cpu_info__online_time(enum diag204_format type, void *hdr) -{ - if (type == DIAG204_INFO_SIMPLE) - return 0; /* online_time not available in simple info */ - else /* DIAG204_INFO_EXT */ - return ((struct diag204_x_cpu_info *)hdr)->online_time; -} - -/* Physical header */ - -static inline int phys_hdr__size(enum diag204_format type) -{ - if (type == DIAG204_INFO_SIMPLE) - return sizeof(struct diag204_phys_hdr); - else /* DIAG204_INFO_EXT */ - return sizeof(struct diag204_x_phys_hdr); -} - -static inline __u8 phys_hdr__cpus(enum diag204_format type, void *hdr) -{ - if (type == DIAG204_INFO_SIMPLE) - return ((struct diag204_phys_hdr *)hdr)->cpus; - else /* DIAG204_INFO_EXT */ - return ((struct diag204_x_phys_hdr *)hdr)->cpus; -} - -/* Physical CPU info block */ - -static inline int phys_cpu__size(enum diag204_format type) -{ - if (type == 
DIAG204_INFO_SIMPLE) - return sizeof(struct diag204_phys_cpu); - else /* DIAG204_INFO_EXT */ - return sizeof(struct diag204_x_phys_cpu); -} - -static inline __u16 phys_cpu__cpu_addr(enum diag204_format type, void *hdr) -{ - if (type == DIAG204_INFO_SIMPLE) - return ((struct diag204_phys_cpu *)hdr)->cpu_addr; - else /* DIAG204_INFO_EXT */ - return ((struct diag204_x_phys_cpu *)hdr)->cpu_addr; -} - -static inline __u64 phys_cpu__mgm_time(enum diag204_format type, void *hdr) -{ - if (type == DIAG204_INFO_SIMPLE) - return ((struct diag204_phys_cpu *)hdr)->mgm_time; - else /* DIAG204_INFO_EXT */ - return ((struct diag204_x_phys_cpu *)hdr)->mgm_time; -} - -static inline __u64 phys_cpu__ctidx(enum diag204_format type, void *hdr) -{ - if (type == DIAG204_INFO_SIMPLE) - return ((struct diag204_phys_cpu *)hdr)->ctidx; - else /* DIAG204_INFO_EXT */ - return ((struct diag204_x_phys_cpu *)hdr)->ctidx; + diag204_info_type = type; } /* Diagnose 204 functions */ @@ -215,7 +55,7 @@ static void diag204_free_buffer(void) diag204_buf = NULL; } -static void *diag204_get_buffer(enum diag204_format fmt, int *pages) +void *diag204_get_buffer(enum diag204_format fmt, int *pages) { if (diag204_buf) { *pages = diag204_buf_pages; @@ -262,13 +102,13 @@ static int diag204_probe(void) if (diag204((unsigned long)DIAG204_SUBC_STIB7 | (unsigned long)DIAG204_INFO_EXT, pages, buf) >= 0) { diag204_store_sc = DIAG204_SUBC_STIB7; - diag204_info_type = DIAG204_INFO_EXT; + diag204_set_info_type(DIAG204_INFO_EXT); goto out; } if (diag204((unsigned long)DIAG204_SUBC_STIB6 | (unsigned long)DIAG204_INFO_EXT, pages, buf) >= 0) { diag204_store_sc = DIAG204_SUBC_STIB6; - diag204_info_type = DIAG204_INFO_EXT; + diag204_set_info_type(DIAG204_INFO_EXT); goto out; } diag204_free_buffer(); @@ -284,7 +124,7 @@ static int diag204_probe(void) if (diag204((unsigned long)DIAG204_SUBC_STIB4 | (unsigned long)DIAG204_INFO_SIMPLE, pages, buf) >= 0) { diag204_store_sc = DIAG204_SUBC_STIB4; - diag204_info_type = 
DIAG204_INFO_SIMPLE; + diag204_set_info_type(DIAG204_INFO_SIMPLE); goto out; } else { rc = -ENOSYS; @@ -298,60 +138,15 @@ fail_alloc: return rc; } -static int diag204_do_store(void *buf, int pages) +int diag204_store(void *buf, int pages) { int rc; - rc = diag204((unsigned long) diag204_store_sc | - (unsigned long) diag204_info_type, pages, buf); + rc = diag204((unsigned long)diag204_store_sc | + (unsigned long)diag204_get_info_type(), pages, buf); return rc < 0 ? -ENOSYS : 0; } -static void *diag204_store(void) -{ - void *buf; - int pages, rc; - - buf = diag204_get_buffer(diag204_info_type, &pages); - if (IS_ERR(buf)) - goto out; - rc = diag204_do_store(buf, pages); - if (rc) - return ERR_PTR(rc); -out: - return buf; -} - -/* Diagnose 224 functions */ - -static int diag224_get_name_table(void) -{ - /* memory must be below 2GB */ - diag224_cpu_names = (char *) __get_free_page(GFP_KERNEL | GFP_DMA); - if (!diag224_cpu_names) - return -ENOMEM; - if (diag224(diag224_cpu_names)) { - free_page((unsigned long) diag224_cpu_names); - return -EOPNOTSUPP; - } - EBCASC(diag224_cpu_names + 16, (*diag224_cpu_names + 1) * 16); - return 0; -} - -static void diag224_delete_name_table(void) -{ - free_page((unsigned long) diag224_cpu_names); -} - -static int diag224_idx2name(int index, char *name) -{ - memcpy(name, diag224_cpu_names + ((index + 1) * DIAG204_CPU_NAME_LEN), - DIAG204_CPU_NAME_LEN); - name[DIAG204_CPU_NAME_LEN] = 0; - strim(name); - return 0; -} - struct dbfs_d204_hdr { u64 len; /* Length of d204 buffer without header */ u16 version; /* Version of header */ @@ -375,7 +170,7 @@ static int dbfs_d204_create(void **data, void **data_free_ptr, size_t *size) if (!base) return -ENOMEM; d204 = PTR_ALIGN(base + sizeof(d204->hdr), PAGE_SIZE) - sizeof(d204->hdr); - rc = diag204_do_store(d204->buf, diag204_buf_pages); + rc = diag204_store(d204->buf, diag204_buf_pages); if (rc) { vfree(base); return rc; @@ -404,176 +199,21 @@ __init int hypfs_diag_init(void) return -ENODATA; } - if 
(diag204_info_type == DIAG204_INFO_EXT) + if (diag204_get_info_type() == DIAG204_INFO_EXT) hypfs_dbfs_create_file(&dbfs_file_d204); - if (MACHINE_IS_LPAR) { - rc = diag224_get_name_table(); - if (rc) { - pr_err("The hardware system does not provide all " - "functions required by hypfs\n"); - debugfs_remove(dbfs_d204_file); - return rc; - } + rc = hypfs_diag_fs_init(); + if (rc) { + pr_err("The hardware system does not provide all functions required by hypfs\n"); + debugfs_remove(dbfs_d204_file); } - return 0; + return rc; } void hypfs_diag_exit(void) { debugfs_remove(dbfs_d204_file); - diag224_delete_name_table(); + hypfs_diag_fs_exit(); diag204_free_buffer(); hypfs_dbfs_remove_file(&dbfs_file_d204); } - -/* - * Functions to create the directory structure - * ******************************************* - */ - -static int hypfs_create_cpu_files(struct dentry *cpus_dir, void *cpu_info) -{ - struct dentry *cpu_dir; - char buffer[TMP_SIZE]; - void *rc; - - snprintf(buffer, TMP_SIZE, "%d", cpu_info__cpu_addr(diag204_info_type, - cpu_info)); - cpu_dir = hypfs_mkdir(cpus_dir, buffer); - rc = hypfs_create_u64(cpu_dir, "mgmtime", - cpu_info__acc_time(diag204_info_type, cpu_info) - - cpu_info__lp_time(diag204_info_type, cpu_info)); - if (IS_ERR(rc)) - return PTR_ERR(rc); - rc = hypfs_create_u64(cpu_dir, "cputime", - cpu_info__lp_time(diag204_info_type, cpu_info)); - if (IS_ERR(rc)) - return PTR_ERR(rc); - if (diag204_info_type == DIAG204_INFO_EXT) { - rc = hypfs_create_u64(cpu_dir, "onlinetime", - cpu_info__online_time(diag204_info_type, - cpu_info)); - if (IS_ERR(rc)) - return PTR_ERR(rc); - } - diag224_idx2name(cpu_info__ctidx(diag204_info_type, cpu_info), buffer); - rc = hypfs_create_str(cpu_dir, "type", buffer); - return PTR_ERR_OR_ZERO(rc); -} - -static void *hypfs_create_lpar_files(struct dentry *systems_dir, void *part_hdr) -{ - struct dentry *cpus_dir; - struct dentry *lpar_dir; - char lpar_name[DIAG204_LPAR_NAME_LEN + 1]; - void *cpu_info; - int i; - - 
part_hdr__part_name(diag204_info_type, part_hdr, lpar_name); - lpar_name[DIAG204_LPAR_NAME_LEN] = 0; - lpar_dir = hypfs_mkdir(systems_dir, lpar_name); - if (IS_ERR(lpar_dir)) - return lpar_dir; - cpus_dir = hypfs_mkdir(lpar_dir, "cpus"); - if (IS_ERR(cpus_dir)) - return cpus_dir; - cpu_info = part_hdr + part_hdr__size(diag204_info_type); - for (i = 0; i < part_hdr__rcpus(diag204_info_type, part_hdr); i++) { - int rc; - rc = hypfs_create_cpu_files(cpus_dir, cpu_info); - if (rc) - return ERR_PTR(rc); - cpu_info += cpu_info__size(diag204_info_type); - } - return cpu_info; -} - -static int hypfs_create_phys_cpu_files(struct dentry *cpus_dir, void *cpu_info) -{ - struct dentry *cpu_dir; - char buffer[TMP_SIZE]; - void *rc; - - snprintf(buffer, TMP_SIZE, "%i", phys_cpu__cpu_addr(diag204_info_type, - cpu_info)); - cpu_dir = hypfs_mkdir(cpus_dir, buffer); - if (IS_ERR(cpu_dir)) - return PTR_ERR(cpu_dir); - rc = hypfs_create_u64(cpu_dir, "mgmtime", - phys_cpu__mgm_time(diag204_info_type, cpu_info)); - if (IS_ERR(rc)) - return PTR_ERR(rc); - diag224_idx2name(phys_cpu__ctidx(diag204_info_type, cpu_info), buffer); - rc = hypfs_create_str(cpu_dir, "type", buffer); - return PTR_ERR_OR_ZERO(rc); -} - -static void *hypfs_create_phys_files(struct dentry *parent_dir, void *phys_hdr) -{ - int i; - void *cpu_info; - struct dentry *cpus_dir; - - cpus_dir = hypfs_mkdir(parent_dir, "cpus"); - if (IS_ERR(cpus_dir)) - return cpus_dir; - cpu_info = phys_hdr + phys_hdr__size(diag204_info_type); - for (i = 0; i < phys_hdr__cpus(diag204_info_type, phys_hdr); i++) { - int rc; - rc = hypfs_create_phys_cpu_files(cpus_dir, cpu_info); - if (rc) - return ERR_PTR(rc); - cpu_info += phys_cpu__size(diag204_info_type); - } - return cpu_info; -} - -int hypfs_diag_create_files(struct dentry *root) -{ - struct dentry *systems_dir, *hyp_dir; - void *time_hdr, *part_hdr; - int i, rc; - void *buffer, *ptr; - - buffer = diag204_store(); - if (IS_ERR(buffer)) - return PTR_ERR(buffer); - - systems_dir = 
hypfs_mkdir(root, "systems"); - if (IS_ERR(systems_dir)) { - rc = PTR_ERR(systems_dir); - goto err_out; - } - time_hdr = (struct x_info_blk_hdr *)buffer; - part_hdr = time_hdr + info_blk_hdr__size(diag204_info_type); - for (i = 0; i < info_blk_hdr__npar(diag204_info_type, time_hdr); i++) { - part_hdr = hypfs_create_lpar_files(systems_dir, part_hdr); - if (IS_ERR(part_hdr)) { - rc = PTR_ERR(part_hdr); - goto err_out; - } - } - if (info_blk_hdr__flags(diag204_info_type, time_hdr) & - DIAG204_LPAR_PHYS_FLG) { - ptr = hypfs_create_phys_files(root, part_hdr); - if (IS_ERR(ptr)) { - rc = PTR_ERR(ptr); - goto err_out; - } - } - hyp_dir = hypfs_mkdir(root, "hyp"); - if (IS_ERR(hyp_dir)) { - rc = PTR_ERR(hyp_dir); - goto err_out; - } - ptr = hypfs_create_str(hyp_dir, "type", "LPAR Hypervisor"); - if (IS_ERR(ptr)) { - rc = PTR_ERR(ptr); - goto err_out; - } - rc = 0; - -err_out: - return rc; -} diff --git a/arch/s390/hypfs/hypfs_diag.h b/arch/s390/hypfs/hypfs_diag.h new file mode 100644 index 000000000000..7090eff27fef --- /dev/null +++ b/arch/s390/hypfs/hypfs_diag.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Hypervisor filesystem for Linux on s390. Diag 204 and 224 + * implementation. + * + * Copyright IBM Corp. 
2006, 2008 + * Author(s): Michael Holzheu + */ + +#ifndef _S390_HYPFS_DIAG_H_ +#define _S390_HYPFS_DIAG_H_ + +#include + +enum diag204_format diag204_get_info_type(void); +void *diag204_get_buffer(enum diag204_format fmt, int *pages); +int diag204_store(void *buf, int pages); + +int __hypfs_diag_fs_init(void); +void __hypfs_diag_fs_exit(void); + +static inline int hypfs_diag_fs_init(void) +{ + if (IS_ENABLED(CONFIG_S390_HYPFS_FS)) + return __hypfs_diag_fs_init(); + return 0; +} + +static inline void hypfs_diag_fs_exit(void) +{ + if (IS_ENABLED(CONFIG_S390_HYPFS_FS)) + __hypfs_diag_fs_exit(); +} + +#endif /* _S390_HYPFS_DIAG_H_ */ diff --git a/arch/s390/hypfs/hypfs_diag_fs.c b/arch/s390/hypfs/hypfs_diag_fs.c new file mode 100644 index 000000000000..00a6d370a280 --- /dev/null +++ b/arch/s390/hypfs/hypfs_diag_fs.c @@ -0,0 +1,393 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Hypervisor filesystem for Linux on s390. Diag 204 and 224 + * implementation. + * + * Copyright IBM Corp. 2006, 2008 + * Author(s): Michael Holzheu + */ + +#define KMSG_COMPONENT "hypfs" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include "hypfs_diag.h" +#include "hypfs.h" + +#define TMP_SIZE 64 /* size of temporary buffers */ + +static char *diag224_cpu_names; /* diag 224 name table */ +static int diag224_idx2name(int index, char *name); + +/* + * DIAG 204 member access functions. + * + * Since we have two different diag 204 data formats for old and new s390 + * machines, we do not access the structs directly, but use getter functions for + * each struct member instead. This should make the code more readable. 
+ */ + +/* Time information block */ + +static inline int info_blk_hdr__size(enum diag204_format type) +{ + if (type == DIAG204_INFO_SIMPLE) + return sizeof(struct diag204_info_blk_hdr); + else /* DIAG204_INFO_EXT */ + return sizeof(struct diag204_x_info_blk_hdr); +} + +static inline __u8 info_blk_hdr__npar(enum diag204_format type, void *hdr) +{ + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_info_blk_hdr *)hdr)->npar; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_info_blk_hdr *)hdr)->npar; +} + +static inline __u8 info_blk_hdr__flags(enum diag204_format type, void *hdr) +{ + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_info_blk_hdr *)hdr)->flags; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_info_blk_hdr *)hdr)->flags; +} + +/* Partition header */ + +static inline int part_hdr__size(enum diag204_format type) +{ + if (type == DIAG204_INFO_SIMPLE) + return sizeof(struct diag204_part_hdr); + else /* DIAG204_INFO_EXT */ + return sizeof(struct diag204_x_part_hdr); +} + +static inline __u8 part_hdr__rcpus(enum diag204_format type, void *hdr) +{ + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_part_hdr *)hdr)->cpus; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_part_hdr *)hdr)->rcpus; +} + +static inline void part_hdr__part_name(enum diag204_format type, void *hdr, + char *name) +{ + if (type == DIAG204_INFO_SIMPLE) + memcpy(name, ((struct diag204_part_hdr *)hdr)->part_name, + DIAG204_LPAR_NAME_LEN); + else /* DIAG204_INFO_EXT */ + memcpy(name, ((struct diag204_x_part_hdr *)hdr)->part_name, + DIAG204_LPAR_NAME_LEN); + EBCASC(name, DIAG204_LPAR_NAME_LEN); + name[DIAG204_LPAR_NAME_LEN] = 0; + strim(name); +} + +/* CPU info block */ + +static inline int cpu_info__size(enum diag204_format type) +{ + if (type == DIAG204_INFO_SIMPLE) + return sizeof(struct diag204_cpu_info); + else /* DIAG204_INFO_EXT */ + return sizeof(struct diag204_x_cpu_info); +} + +static inline __u8 cpu_info__ctidx(enum 
diag204_format type, void *hdr) +{ + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_cpu_info *)hdr)->ctidx; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_cpu_info *)hdr)->ctidx; +} + +static inline __u16 cpu_info__cpu_addr(enum diag204_format type, void *hdr) +{ + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_cpu_info *)hdr)->cpu_addr; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_cpu_info *)hdr)->cpu_addr; +} + +static inline __u64 cpu_info__acc_time(enum diag204_format type, void *hdr) +{ + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_cpu_info *)hdr)->acc_time; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_cpu_info *)hdr)->acc_time; +} + +static inline __u64 cpu_info__lp_time(enum diag204_format type, void *hdr) +{ + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_cpu_info *)hdr)->lp_time; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_cpu_info *)hdr)->lp_time; +} + +static inline __u64 cpu_info__online_time(enum diag204_format type, void *hdr) +{ + if (type == DIAG204_INFO_SIMPLE) + return 0; /* online_time not available in simple info */ + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_cpu_info *)hdr)->online_time; +} + +/* Physical header */ + +static inline int phys_hdr__size(enum diag204_format type) +{ + if (type == DIAG204_INFO_SIMPLE) + return sizeof(struct diag204_phys_hdr); + else /* DIAG204_INFO_EXT */ + return sizeof(struct diag204_x_phys_hdr); +} + +static inline __u8 phys_hdr__cpus(enum diag204_format type, void *hdr) +{ + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_phys_hdr *)hdr)->cpus; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_phys_hdr *)hdr)->cpus; +} + +/* Physical CPU info block */ + +static inline int phys_cpu__size(enum diag204_format type) +{ + if (type == DIAG204_INFO_SIMPLE) + return sizeof(struct diag204_phys_cpu); + else /* DIAG204_INFO_EXT */ + return sizeof(struct diag204_x_phys_cpu); +} + +static 
inline __u16 phys_cpu__cpu_addr(enum diag204_format type, void *hdr) +{ + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_phys_cpu *)hdr)->cpu_addr; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_phys_cpu *)hdr)->cpu_addr; +} + +static inline __u64 phys_cpu__mgm_time(enum diag204_format type, void *hdr) +{ + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_phys_cpu *)hdr)->mgm_time; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_phys_cpu *)hdr)->mgm_time; +} + +static inline __u64 phys_cpu__ctidx(enum diag204_format type, void *hdr) +{ + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_phys_cpu *)hdr)->ctidx; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_phys_cpu *)hdr)->ctidx; +} + +/* + * Functions to create the directory structure + * ******************************************* + */ + +static int hypfs_create_cpu_files(struct dentry *cpus_dir, void *cpu_info) +{ + struct dentry *cpu_dir; + char buffer[TMP_SIZE]; + void *rc; + + snprintf(buffer, TMP_SIZE, "%d", cpu_info__cpu_addr(diag204_get_info_type(), + cpu_info)); + cpu_dir = hypfs_mkdir(cpus_dir, buffer); + rc = hypfs_create_u64(cpu_dir, "mgmtime", + cpu_info__acc_time(diag204_get_info_type(), cpu_info) - + cpu_info__lp_time(diag204_get_info_type(), cpu_info)); + if (IS_ERR(rc)) + return PTR_ERR(rc); + rc = hypfs_create_u64(cpu_dir, "cputime", + cpu_info__lp_time(diag204_get_info_type(), cpu_info)); + if (IS_ERR(rc)) + return PTR_ERR(rc); + if (diag204_get_info_type() == DIAG204_INFO_EXT) { + rc = hypfs_create_u64(cpu_dir, "onlinetime", + cpu_info__online_time(diag204_get_info_type(), + cpu_info)); + if (IS_ERR(rc)) + return PTR_ERR(rc); + } + diag224_idx2name(cpu_info__ctidx(diag204_get_info_type(), cpu_info), buffer); + rc = hypfs_create_str(cpu_dir, "type", buffer); + return PTR_ERR_OR_ZERO(rc); +} + +static void *hypfs_create_lpar_files(struct dentry *systems_dir, void *part_hdr) +{ + struct dentry *cpus_dir; + struct dentry *lpar_dir; + 
char lpar_name[DIAG204_LPAR_NAME_LEN + 1]; + void *cpu_info; + int i; + + part_hdr__part_name(diag204_get_info_type(), part_hdr, lpar_name); + lpar_name[DIAG204_LPAR_NAME_LEN] = 0; + lpar_dir = hypfs_mkdir(systems_dir, lpar_name); + if (IS_ERR(lpar_dir)) + return lpar_dir; + cpus_dir = hypfs_mkdir(lpar_dir, "cpus"); + if (IS_ERR(cpus_dir)) + return cpus_dir; + cpu_info = part_hdr + part_hdr__size(diag204_get_info_type()); + for (i = 0; i < part_hdr__rcpus(diag204_get_info_type(), part_hdr); i++) { + int rc; + + rc = hypfs_create_cpu_files(cpus_dir, cpu_info); + if (rc) + return ERR_PTR(rc); + cpu_info += cpu_info__size(diag204_get_info_type()); + } + return cpu_info; +} + +static int hypfs_create_phys_cpu_files(struct dentry *cpus_dir, void *cpu_info) +{ + struct dentry *cpu_dir; + char buffer[TMP_SIZE]; + void *rc; + + snprintf(buffer, TMP_SIZE, "%i", phys_cpu__cpu_addr(diag204_get_info_type(), + cpu_info)); + cpu_dir = hypfs_mkdir(cpus_dir, buffer); + if (IS_ERR(cpu_dir)) + return PTR_ERR(cpu_dir); + rc = hypfs_create_u64(cpu_dir, "mgmtime", + phys_cpu__mgm_time(diag204_get_info_type(), cpu_info)); + if (IS_ERR(rc)) + return PTR_ERR(rc); + diag224_idx2name(phys_cpu__ctidx(diag204_get_info_type(), cpu_info), buffer); + rc = hypfs_create_str(cpu_dir, "type", buffer); + return PTR_ERR_OR_ZERO(rc); +} + +static void *hypfs_create_phys_files(struct dentry *parent_dir, void *phys_hdr) +{ + int i; + void *cpu_info; + struct dentry *cpus_dir; + + cpus_dir = hypfs_mkdir(parent_dir, "cpus"); + if (IS_ERR(cpus_dir)) + return cpus_dir; + cpu_info = phys_hdr + phys_hdr__size(diag204_get_info_type()); + for (i = 0; i < phys_hdr__cpus(diag204_get_info_type(), phys_hdr); i++) { + int rc; + + rc = hypfs_create_phys_cpu_files(cpus_dir, cpu_info); + if (rc) + return ERR_PTR(rc); + cpu_info += phys_cpu__size(diag204_get_info_type()); + } + return cpu_info; +} + +int hypfs_diag_create_files(struct dentry *root) +{ + struct dentry *systems_dir, *hyp_dir; + void *time_hdr, *part_hdr; + 
void *buffer, *ptr; + int i, rc, pages; + + buffer = diag204_get_buffer(diag204_get_info_type(), &pages); + if (IS_ERR(buffer)) + return PTR_ERR(buffer); + rc = diag204_store(buffer, pages); + if (rc) + return rc; + + systems_dir = hypfs_mkdir(root, "systems"); + if (IS_ERR(systems_dir)) { + rc = PTR_ERR(systems_dir); + goto err_out; + } + time_hdr = (struct x_info_blk_hdr *)buffer; + part_hdr = time_hdr + info_blk_hdr__size(diag204_get_info_type()); + for (i = 0; i < info_blk_hdr__npar(diag204_get_info_type(), time_hdr); i++) { + part_hdr = hypfs_create_lpar_files(systems_dir, part_hdr); + if (IS_ERR(part_hdr)) { + rc = PTR_ERR(part_hdr); + goto err_out; + } + } + if (info_blk_hdr__flags(diag204_get_info_type(), time_hdr) & + DIAG204_LPAR_PHYS_FLG) { + ptr = hypfs_create_phys_files(root, part_hdr); + if (IS_ERR(ptr)) { + rc = PTR_ERR(ptr); + goto err_out; + } + } + hyp_dir = hypfs_mkdir(root, "hyp"); + if (IS_ERR(hyp_dir)) { + rc = PTR_ERR(hyp_dir); + goto err_out; + } + ptr = hypfs_create_str(hyp_dir, "type", "LPAR Hypervisor"); + if (IS_ERR(ptr)) { + rc = PTR_ERR(ptr); + goto err_out; + } + rc = 0; + +err_out: + return rc; +} + +/* Diagnose 224 functions */ + +static int diag224_idx2name(int index, char *name) +{ + memcpy(name, diag224_cpu_names + ((index + 1) * DIAG204_CPU_NAME_LEN), + DIAG204_CPU_NAME_LEN); + name[DIAG204_CPU_NAME_LEN] = 0; + strim(name); + return 0; +} + +static int diag224_get_name_table(void) +{ + /* memory must be below 2GB */ + diag224_cpu_names = (char *)__get_free_page(GFP_KERNEL | GFP_DMA); + if (!diag224_cpu_names) + return -ENOMEM; + if (diag224(diag224_cpu_names)) { + free_page((unsigned long)diag224_cpu_names); + return -EOPNOTSUPP; + } + EBCASC(diag224_cpu_names + 16, (*diag224_cpu_names + 1) * 16); + return 0; +} + +static void diag224_delete_name_table(void) +{ + free_page((unsigned long)diag224_cpu_names); +} + +int __init __hypfs_diag_fs_init(void) +{ + if (MACHINE_IS_LPAR) + return diag224_get_name_table(); + return 0; +} + 
+void __hypfs_diag_fs_exit(void) +{ + diag224_delete_name_table(); +} diff --git a/arch/s390/hypfs/hypfs_vm.c b/arch/s390/hypfs/hypfs_vm.c index a3d881ca0a98..3db40ad853e0 100644 --- a/arch/s390/hypfs/hypfs_vm.c +++ b/arch/s390/hypfs/hypfs_vm.c @@ -14,47 +14,15 @@ #include #include #include +#include "hypfs_vm.h" #include "hypfs.h" -#define NAME_LEN 8 #define DBFS_D2FC_HDR_VERSION 0 static char local_guest[] = " "; static char all_guests[] = "* "; static char *all_groups = all_guests; -static char *guest_query; - -struct diag2fc_data { - __u32 version; - __u32 flags; - __u64 used_cpu; - __u64 el_time; - __u64 mem_min_kb; - __u64 mem_max_kb; - __u64 mem_share_kb; - __u64 mem_used_kb; - __u32 pcpus; - __u32 lcpus; - __u32 vcpus; - __u32 ocpus; - __u32 cpu_max; - __u32 cpu_shares; - __u32 cpu_use_samp; - __u32 cpu_delay_samp; - __u32 page_wait_samp; - __u32 idle_samp; - __u32 other_samp; - __u32 total_samp; - char guest_name[NAME_LEN]; -}; - -struct diag2fc_parm_list { - char userid[NAME_LEN]; - char aci_grp[NAME_LEN]; - __u64 addr; - __u32 size; - __u32 fmt; -}; +char *diag2fc_guest_query; static int diag2fc(int size, char* query, void *addr) { @@ -62,10 +30,10 @@ static int diag2fc(int size, char* query, void *addr) unsigned long rc; struct diag2fc_parm_list parm_list; - memcpy(parm_list.userid, query, NAME_LEN); - ASCEBC(parm_list.userid, NAME_LEN); - memcpy(parm_list.aci_grp, all_groups, NAME_LEN); - ASCEBC(parm_list.aci_grp, NAME_LEN); + memcpy(parm_list.userid, query, DIAG2FC_NAME_LEN); + ASCEBC(parm_list.userid, DIAG2FC_NAME_LEN); + memcpy(parm_list.aci_grp, all_groups, DIAG2FC_NAME_LEN); + ASCEBC(parm_list.aci_grp, DIAG2FC_NAME_LEN); parm_list.addr = (unsigned long)addr; parm_list.size = size; parm_list.fmt = 0x02; @@ -87,7 +55,7 @@ static int diag2fc(int size, char* query, void *addr) /* * Allocate buffer for "query" and store diag 2fc at "offset" */ -static void *diag2fc_store(char *query, unsigned int *count, int offset) +void *diag2fc_store(char *query, 
unsigned int *count, int offset) { void *data; int size; @@ -108,132 +76,11 @@ static void *diag2fc_store(char *query, unsigned int *count, int offset) return data; } -static void diag2fc_free(const void *data) +void diag2fc_free(const void *data) { vfree(data); } -#define ATTRIBUTE(dir, name, member) \ -do { \ - void *rc; \ - rc = hypfs_create_u64(dir, name, member); \ - if (IS_ERR(rc)) \ - return PTR_ERR(rc); \ -} while(0) - -static int hypfs_vm_create_guest(struct dentry *systems_dir, - struct diag2fc_data *data) -{ - char guest_name[NAME_LEN + 1] = {}; - struct dentry *guest_dir, *cpus_dir, *samples_dir, *mem_dir; - int dedicated_flag, capped_value; - - capped_value = (data->flags & 0x00000006) >> 1; - dedicated_flag = (data->flags & 0x00000008) >> 3; - - /* guest dir */ - memcpy(guest_name, data->guest_name, NAME_LEN); - EBCASC(guest_name, NAME_LEN); - strim(guest_name); - guest_dir = hypfs_mkdir(systems_dir, guest_name); - if (IS_ERR(guest_dir)) - return PTR_ERR(guest_dir); - ATTRIBUTE(guest_dir, "onlinetime_us", data->el_time); - - /* logical cpu information */ - cpus_dir = hypfs_mkdir(guest_dir, "cpus"); - if (IS_ERR(cpus_dir)) - return PTR_ERR(cpus_dir); - ATTRIBUTE(cpus_dir, "cputime_us", data->used_cpu); - ATTRIBUTE(cpus_dir, "capped", capped_value); - ATTRIBUTE(cpus_dir, "dedicated", dedicated_flag); - ATTRIBUTE(cpus_dir, "count", data->vcpus); - /* - * Note: The "weight_min" attribute got the wrong name. - * The value represents the number of non-stopped (operating) - * CPUS. 
- */ - ATTRIBUTE(cpus_dir, "weight_min", data->ocpus); - ATTRIBUTE(cpus_dir, "weight_max", data->cpu_max); - ATTRIBUTE(cpus_dir, "weight_cur", data->cpu_shares); - - /* memory information */ - mem_dir = hypfs_mkdir(guest_dir, "mem"); - if (IS_ERR(mem_dir)) - return PTR_ERR(mem_dir); - ATTRIBUTE(mem_dir, "min_KiB", data->mem_min_kb); - ATTRIBUTE(mem_dir, "max_KiB", data->mem_max_kb); - ATTRIBUTE(mem_dir, "used_KiB", data->mem_used_kb); - ATTRIBUTE(mem_dir, "share_KiB", data->mem_share_kb); - - /* samples */ - samples_dir = hypfs_mkdir(guest_dir, "samples"); - if (IS_ERR(samples_dir)) - return PTR_ERR(samples_dir); - ATTRIBUTE(samples_dir, "cpu_using", data->cpu_use_samp); - ATTRIBUTE(samples_dir, "cpu_delay", data->cpu_delay_samp); - ATTRIBUTE(samples_dir, "mem_delay", data->page_wait_samp); - ATTRIBUTE(samples_dir, "idle", data->idle_samp); - ATTRIBUTE(samples_dir, "other", data->other_samp); - ATTRIBUTE(samples_dir, "total", data->total_samp); - return 0; -} - -int hypfs_vm_create_files(struct dentry *root) -{ - struct dentry *dir, *file; - struct diag2fc_data *data; - unsigned int count = 0; - int rc, i; - - data = diag2fc_store(guest_query, &count, 0); - if (IS_ERR(data)) - return PTR_ERR(data); - - /* Hypervisor Info */ - dir = hypfs_mkdir(root, "hyp"); - if (IS_ERR(dir)) { - rc = PTR_ERR(dir); - goto failed; - } - file = hypfs_create_str(dir, "type", "z/VM Hypervisor"); - if (IS_ERR(file)) { - rc = PTR_ERR(file); - goto failed; - } - - /* physical cpus */ - dir = hypfs_mkdir(root, "cpus"); - if (IS_ERR(dir)) { - rc = PTR_ERR(dir); - goto failed; - } - file = hypfs_create_u64(dir, "count", data->lcpus); - if (IS_ERR(file)) { - rc = PTR_ERR(file); - goto failed; - } - - /* guests */ - dir = hypfs_mkdir(root, "systems"); - if (IS_ERR(dir)) { - rc = PTR_ERR(dir); - goto failed; - } - - for (i = 0; i < count; i++) { - rc = hypfs_vm_create_guest(dir, &(data[i])); - if (rc) - goto failed; - } - diag2fc_free(data); - return 0; - -failed: - diag2fc_free(data); - return 
rc; -} - struct dbfs_d2fc_hdr { u64 len; /* Length of d2fc buffer without header */ u16 version; /* Version of header */ @@ -252,7 +99,7 @@ static int dbfs_diag2fc_create(void **data, void **data_free_ptr, size_t *size) struct dbfs_d2fc *d2fc; unsigned int count; - d2fc = diag2fc_store(guest_query, &count, sizeof(d2fc->hdr)); + d2fc = diag2fc_store(diag2fc_guest_query, &count, sizeof(d2fc->hdr)); if (IS_ERR(d2fc)) return PTR_ERR(d2fc); store_tod_clock_ext(&d2fc->hdr.tod_ext); @@ -277,9 +124,9 @@ int hypfs_vm_init(void) if (!MACHINE_IS_VM) return 0; if (diag2fc(0, all_guests, NULL) > 0) - guest_query = all_guests; + diag2fc_guest_query = all_guests; else if (diag2fc(0, local_guest, NULL) > 0) - guest_query = local_guest; + diag2fc_guest_query = local_guest; else return -EACCES; hypfs_dbfs_create_file(&dbfs_file_2fc); diff --git a/arch/s390/hypfs/hypfs_vm.h b/arch/s390/hypfs/hypfs_vm.h new file mode 100644 index 000000000000..fe2e5851addd --- /dev/null +++ b/arch/s390/hypfs/hypfs_vm.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Hypervisor filesystem for Linux on s390. z/VM implementation. + * + * Copyright IBM Corp. 
2006 + * Author(s): Michael Holzheu + */ + +#ifndef _S390_HYPFS_VM_H_ +#define _S390_HYPFS_VM_H_ + +#define DIAG2FC_NAME_LEN 8 + +struct diag2fc_data { + __u32 version; + __u32 flags; + __u64 used_cpu; + __u64 el_time; + __u64 mem_min_kb; + __u64 mem_max_kb; + __u64 mem_share_kb; + __u64 mem_used_kb; + __u32 pcpus; + __u32 lcpus; + __u32 vcpus; + __u32 ocpus; + __u32 cpu_max; + __u32 cpu_shares; + __u32 cpu_use_samp; + __u32 cpu_delay_samp; + __u32 page_wait_samp; + __u32 idle_samp; + __u32 other_samp; + __u32 total_samp; + char guest_name[DIAG2FC_NAME_LEN]; +}; + +struct diag2fc_parm_list { + char userid[DIAG2FC_NAME_LEN]; + char aci_grp[DIAG2FC_NAME_LEN]; + __u64 addr; + __u32 size; + __u32 fmt; +}; + +void *diag2fc_store(char *query, unsigned int *count, int offset); +void diag2fc_free(const void *data); +extern char *diag2fc_guest_query; + +#endif /* _S390_HYPFS_VM_H_ */ diff --git a/arch/s390/hypfs/hypfs_vm_fs.c b/arch/s390/hypfs/hypfs_vm_fs.c new file mode 100644 index 000000000000..6011289afa8c --- /dev/null +++ b/arch/s390/hypfs/hypfs_vm_fs.c @@ -0,0 +1,139 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Hypervisor filesystem for Linux on s390. z/VM implementation. + * + * Copyright IBM Corp. 
2006 + * Author(s): Michael Holzheu + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "hypfs_vm.h" +#include "hypfs.h" + +#define ATTRIBUTE(dir, name, member) \ +do { \ + void *rc; \ + rc = hypfs_create_u64(dir, name, member); \ + if (IS_ERR(rc)) \ + return PTR_ERR(rc); \ +} while (0) + +static int hypfs_vm_create_guest(struct dentry *systems_dir, + struct diag2fc_data *data) +{ + char guest_name[DIAG2FC_NAME_LEN + 1] = {}; + struct dentry *guest_dir, *cpus_dir, *samples_dir, *mem_dir; + int dedicated_flag, capped_value; + + capped_value = (data->flags & 0x00000006) >> 1; + dedicated_flag = (data->flags & 0x00000008) >> 3; + + /* guest dir */ + memcpy(guest_name, data->guest_name, DIAG2FC_NAME_LEN); + EBCASC(guest_name, DIAG2FC_NAME_LEN); + strim(guest_name); + guest_dir = hypfs_mkdir(systems_dir, guest_name); + if (IS_ERR(guest_dir)) + return PTR_ERR(guest_dir); + ATTRIBUTE(guest_dir, "onlinetime_us", data->el_time); + + /* logical cpu information */ + cpus_dir = hypfs_mkdir(guest_dir, "cpus"); + if (IS_ERR(cpus_dir)) + return PTR_ERR(cpus_dir); + ATTRIBUTE(cpus_dir, "cputime_us", data->used_cpu); + ATTRIBUTE(cpus_dir, "capped", capped_value); + ATTRIBUTE(cpus_dir, "dedicated", dedicated_flag); + ATTRIBUTE(cpus_dir, "count", data->vcpus); + /* + * Note: The "weight_min" attribute got the wrong name. + * The value represents the number of non-stopped (operating) + * CPUS. 
+ */ + ATTRIBUTE(cpus_dir, "weight_min", data->ocpus); + ATTRIBUTE(cpus_dir, "weight_max", data->cpu_max); + ATTRIBUTE(cpus_dir, "weight_cur", data->cpu_shares); + + /* memory information */ + mem_dir = hypfs_mkdir(guest_dir, "mem"); + if (IS_ERR(mem_dir)) + return PTR_ERR(mem_dir); + ATTRIBUTE(mem_dir, "min_KiB", data->mem_min_kb); + ATTRIBUTE(mem_dir, "max_KiB", data->mem_max_kb); + ATTRIBUTE(mem_dir, "used_KiB", data->mem_used_kb); + ATTRIBUTE(mem_dir, "share_KiB", data->mem_share_kb); + + /* samples */ + samples_dir = hypfs_mkdir(guest_dir, "samples"); + if (IS_ERR(samples_dir)) + return PTR_ERR(samples_dir); + ATTRIBUTE(samples_dir, "cpu_using", data->cpu_use_samp); + ATTRIBUTE(samples_dir, "cpu_delay", data->cpu_delay_samp); + ATTRIBUTE(samples_dir, "mem_delay", data->page_wait_samp); + ATTRIBUTE(samples_dir, "idle", data->idle_samp); + ATTRIBUTE(samples_dir, "other", data->other_samp); + ATTRIBUTE(samples_dir, "total", data->total_samp); + return 0; +} + +int hypfs_vm_create_files(struct dentry *root) +{ + struct dentry *dir, *file; + struct diag2fc_data *data; + unsigned int count = 0; + int rc, i; + + data = diag2fc_store(diag2fc_guest_query, &count, 0); + if (IS_ERR(data)) + return PTR_ERR(data); + + /* Hypervisor Info */ + dir = hypfs_mkdir(root, "hyp"); + if (IS_ERR(dir)) { + rc = PTR_ERR(dir); + goto failed; + } + file = hypfs_create_str(dir, "type", "z/VM Hypervisor"); + if (IS_ERR(file)) { + rc = PTR_ERR(file); + goto failed; + } + + /* physical cpus */ + dir = hypfs_mkdir(root, "cpus"); + if (IS_ERR(dir)) { + rc = PTR_ERR(dir); + goto failed; + } + file = hypfs_create_u64(dir, "count", data->lcpus); + if (IS_ERR(file)) { + rc = PTR_ERR(file); + goto failed; + } + + /* guests */ + dir = hypfs_mkdir(root, "systems"); + if (IS_ERR(dir)) { + rc = PTR_ERR(dir); + goto failed; + } + + for (i = 0; i < count; i++) { + rc = hypfs_vm_create_guest(dir, &data[i]); + if (rc) + goto failed; + } + diag2fc_free(data); + return 0; + +failed: + diag2fc_free(data); + 
return rc; +} diff --git a/arch/s390/hypfs/inode.c b/arch/s390/hypfs/inode.c index ee919bfc8186..0d53483fec34 100644 --- a/arch/s390/hypfs/inode.c +++ b/arch/s390/hypfs/inode.c @@ -460,45 +460,18 @@ static const struct super_operations hypfs_s_ops = { .show_options = hypfs_show_options, }; -static int __init hypfs_init(void) +int __init __hypfs_fs_init(void) { int rc; - hypfs_dbfs_init(); - - if (hypfs_diag_init()) { - rc = -ENODATA; - goto fail_dbfs_exit; - } - if (hypfs_vm_init()) { - rc = -ENODATA; - goto fail_hypfs_diag_exit; - } - hypfs_sprp_init(); - if (hypfs_diag0c_init()) { - rc = -ENODATA; - goto fail_hypfs_sprp_exit; - } rc = sysfs_create_mount_point(hypervisor_kobj, "s390"); if (rc) - goto fail_hypfs_diag0c_exit; + return rc; rc = register_filesystem(&hypfs_type); if (rc) - goto fail_filesystem; + goto fail; return 0; - -fail_filesystem: +fail: sysfs_remove_mount_point(hypervisor_kobj, "s390"); -fail_hypfs_diag0c_exit: - hypfs_diag0c_exit(); -fail_hypfs_sprp_exit: - hypfs_sprp_exit(); - hypfs_vm_exit(); -fail_hypfs_diag_exit: - hypfs_diag_exit(); - pr_err("Initialization of hypfs failed with rc=%i\n", rc); -fail_dbfs_exit: - hypfs_dbfs_exit(); return rc; } -device_initcall(hypfs_init) -- cgit v1.2.3 From 1256e70a082ad855efe351146dfa39207d5e3e70 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Thu, 29 Jun 2023 12:02:19 +0200 Subject: s390/ftrace: enable HAVE_FUNCTION_GRAPH_RETVAL Add support for tracing return values in the function graph tracer. 
This requires return_to_handler() to record gpr2 and the frame pointer Signed-off-by: Sven Schnelle Reviewed-by: Heiko Carstens Signed-off-by: Heiko Carstens --- arch/s390/Kconfig | 1 + arch/s390/include/asm/ftrace.h | 17 +++++++++++++++++ arch/s390/kernel/asm-offsets.c | 7 +++++++ arch/s390/kernel/mcount.S | 8 ++++++-- 4 files changed, 31 insertions(+), 2 deletions(-) diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index ea0e8f34eb0d..c5be7199067b 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -174,6 +174,7 @@ config S390 select HAVE_FTRACE_MCOUNT_RECORD select HAVE_FUNCTION_ARG_ACCESS_API select HAVE_FUNCTION_ERROR_INJECTION + select HAVE_FUNCTION_GRAPH_RETVAL select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACER select HAVE_GCC_PLUGINS diff --git a/arch/s390/include/asm/ftrace.h b/arch/s390/include/asm/ftrace.h index e5c5cb1207e2..5a82b08f03cd 100644 --- a/arch/s390/include/asm/ftrace.h +++ b/arch/s390/include/asm/ftrace.h @@ -54,6 +54,23 @@ static __always_inline struct pt_regs *arch_ftrace_get_regs(struct ftrace_regs * return NULL; } +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +struct fgraph_ret_regs { + unsigned long gpr2; + unsigned long fp; +}; + +static __always_inline unsigned long fgraph_ret_regs_return_value(struct fgraph_ret_regs *ret_regs) +{ + return ret_regs->gpr2; +} + +static __always_inline unsigned long fgraph_ret_regs_frame_pointer(struct fgraph_ret_regs *ret_regs) +{ + return ret_regs->fp; +} +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + static __always_inline unsigned long ftrace_regs_get_instruction_pointer(const struct ftrace_regs *fregs) { diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c index 81cf72088041..150809c9ffa5 100644 --- a/arch/s390/kernel/asm-offsets.c +++ b/arch/s390/kernel/asm-offsets.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -177,5 +178,11 @@ int main(void) DEFINE(OLDMEM_SIZE, PARMAREA + offsetof(struct parmarea, oldmem_size)); 
DEFINE(COMMAND_LINE, PARMAREA + offsetof(struct parmarea, command_line)); DEFINE(MAX_COMMAND_LINE_SIZE, PARMAREA + offsetof(struct parmarea, max_command_line_size)); +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + /* function graph return value tracing */ + OFFSET(__FGRAPH_RET_GPR2, fgraph_ret_regs, gpr2); + OFFSET(__FGRAPH_RET_FP, fgraph_ret_regs, fp); + DEFINE(__FGRAPH_RET_SIZE, sizeof(struct fgraph_ret_regs)); +#endif return 0; } diff --git a/arch/s390/kernel/mcount.S b/arch/s390/kernel/mcount.S index dbece2803c50..fd27ff9f2cf3 100644 --- a/arch/s390/kernel/mcount.S +++ b/arch/s390/kernel/mcount.S @@ -128,10 +128,14 @@ SYM_CODE_END(ftrace_common) SYM_FUNC_START(return_to_handler) stmg %r2,%r5,32(%r15) lgr %r1,%r15 - aghi %r15,-STACK_FRAME_OVERHEAD + aghi %r15,-(STACK_FRAME_OVERHEAD+__FGRAPH_RET_SIZE) stg %r1,__SF_BACKCHAIN(%r15) + aghik %r3,%r15,STACK_FRAME_OVERHEAD + stg %r1,__FGRAPH_RET_FP(%r3) + stg %r2,__FGRAPH_RET_GPR2(%r3) + lgr %r2,%r3 brasl %r14,ftrace_return_to_handler - aghi %r15,STACK_FRAME_OVERHEAD + aghi %r15,STACK_FRAME_OVERHEAD+__FGRAPH_RET_SIZE lgr %r14,%r2 lmg %r2,%r5,32(%r15) BR_EX %r14 -- cgit v1.2.3 From b9b4568843bb6ceee85bf1280473f510701c82e2 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Thu, 6 Jul 2023 16:24:11 +0200 Subject: s390/kexec: make machine_kexec() depend on CONFIG_KEXEC_CORE Make machine_kexec.o and relocate_kernel.o depend on CONFIG_KEXEC_CORE option as other architectures do. Still generate machine_kexec_reloc.o unconditionally, since arch_kexec_do_relocs() function is needed by the decompressor.
Suggested-by: Nathan Chancellor Reported-by: Nathan Chancellor Reported-by: Linux Kernel Functional Testing Reviewed-by: Heiko Carstens Signed-off-by: Alexander Gordeev Signed-off-by: Heiko Carstens --- arch/s390/kernel/Makefile | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile index 8d7514c72bb8..0df2b88cc0da 100644 --- a/arch/s390/kernel/Makefile +++ b/arch/s390/kernel/Makefile @@ -37,9 +37,9 @@ CFLAGS_unwind_bc.o += -fno-optimize-sibling-calls obj-y := head64.o traps.o time.o process.o earlypgm.o early.o setup.o idle.o vtime.o obj-y += processor.o syscall.o ptrace.o signal.o cpcmd.o ebcdic.o nmi.o obj-y += debug.o irq.o ipl.o dis.o diag.o vdso.o cpufeature.o -obj-y += sysinfo.o lgr.o os_info.o machine_kexec.o +obj-y += sysinfo.o lgr.o os_info.o obj-y += runtime_instr.o cache.o fpu.o dumpstack.o guarded_storage.o sthyi.o -obj-y += entry.o reipl.o relocate_kernel.o kdebugfs.o alternative.o +obj-y += entry.o reipl.o kdebugfs.o alternative.o obj-y += nospec-branch.o ipl_vmparm.o machine_kexec_reloc.o unwind_bc.o obj-y += smp.o text_amode31.o stacktrace.o abs_lowcore.o @@ -63,6 +63,7 @@ obj-$(CONFIG_RETHOOK) += rethook.o obj-$(CONFIG_FUNCTION_TRACER) += ftrace.o obj-$(CONFIG_FUNCTION_TRACER) += mcount.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o +obj-$(CONFIG_KEXEC_CORE) += machine_kexec.o relocate_kernel.o obj-$(CONFIG_UPROBES) += uprobes.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o -- cgit v1.2.3 From 04b8698ae879b88f96c083292f328bd31b555422 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Tue, 11 Jul 2023 17:59:42 +0200 Subject: s390/dcssblk: use IS_ALIGNED() for alignment checks Use IS_ALIGNED() instead of cumbersome bit manipulations. 
Reviewed-by: Gerald Schaefer Signed-off-by: Alexander Gordeev Signed-off-by: Heiko Carstens --- drivers/s390/block/dcssblk.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index 09acf3853a77..4f157dc1608e 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -868,8 +868,8 @@ dcssblk_submit_bio(struct bio *bio) dev_info = bio->bi_bdev->bd_disk->private_data; if (dev_info == NULL) goto fail; - if ((bio->bi_iter.bi_sector & 7) != 0 || - (bio->bi_iter.bi_size & 4095) != 0) + if (!IS_ALIGNED(bio->bi_iter.bi_sector, 8) || + !IS_ALIGNED(bio->bi_iter.bi_size, PAGE_SIZE)) /* Request is not page-aligned. */ goto fail; /* verify data transfer direction */ @@ -891,7 +891,8 @@ dcssblk_submit_bio(struct bio *bio) bio_for_each_segment(bvec, bio, iter) { page_addr = (unsigned long)bvec_virt(&bvec); source_addr = dev_info->start + (index<<12) + bytes_done; - if (unlikely((page_addr & 4095) != 0) || (bvec.bv_len & 4095) != 0) + if (unlikely(!IS_ALIGNED(page_addr, PAGE_SIZE) || + !IS_ALIGNED(bvec.bv_len, PAGE_SIZE))) // More paranoia. goto fail; if (bio_data_dir(bio) == READ) { -- cgit v1.2.3 From 3b53d7b131bd79d97dd553af84846fde456e029f Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Tue, 11 Jul 2023 17:41:37 +0200 Subject: s390/dcssblk: fix virtual vs physical address confusion Fix virtual vs physical address confusion (which currently are the same). 
Reviewed-by: Gerald Schaefer Signed-off-by: Alexander Gordeev Signed-off-by: Heiko Carstens --- drivers/s390/block/dcssblk.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index 4f157dc1608e..6eafd0a34483 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -860,7 +860,7 @@ dcssblk_submit_bio(struct bio *bio) struct bio_vec bvec; struct bvec_iter iter; unsigned long index; - unsigned long page_addr; + void *page_addr; unsigned long source_addr; unsigned long bytes_done; @@ -889,19 +889,16 @@ dcssblk_submit_bio(struct bio *bio) index = (bio->bi_iter.bi_sector >> 3); bio_for_each_segment(bvec, bio, iter) { - page_addr = (unsigned long)bvec_virt(&bvec); + page_addr = bvec_virt(&bvec); source_addr = dev_info->start + (index<<12) + bytes_done; - if (unlikely(!IS_ALIGNED(page_addr, PAGE_SIZE) || + if (unlikely(!IS_ALIGNED((unsigned long)page_addr, PAGE_SIZE) || !IS_ALIGNED(bvec.bv_len, PAGE_SIZE))) // More paranoia. 
goto fail; - if (bio_data_dir(bio) == READ) { - memcpy((void*)page_addr, (void*)source_addr, - bvec.bv_len); - } else { - memcpy((void*)source_addr, (void*)page_addr, - bvec.bv_len); - } + if (bio_data_dir(bio) == READ) + memcpy(page_addr, __va(source_addr), bvec.bv_len); + else + memcpy(__va(source_addr), page_addr, bvec.bv_len); bytes_done += bvec.bv_len; } bio_endio(bio); -- cgit v1.2.3 From 355e30ca1a707526b23a9b016fa4f740e9379c8d Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Sun, 2 Jul 2023 22:29:10 +0200 Subject: s390/mm: remove redundant check against VMEM_MAX_PHYS The value of ident_map_size could never exceed the value of vmemmap as secured by setup_kernel_memory_layout() function: /* make sure identity map doesn't overlay with vmemmap */ ident_map_size = min(ident_map_size, vmemmap_start); Since VMEM_MAX_PHYS macro is set to vmemmap and a newly added range is checked against ident_map_size in add_memory_merged() function anyway, the check against VMEM_MAX_PHYS is redundant. 
Reviewed-by: Heiko Carstens Signed-off-by: Alexander Gordeev Signed-off-by: Heiko Carstens --- drivers/s390/char/sclp_cmd.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/s390/char/sclp_cmd.c b/drivers/s390/char/sclp_cmd.c index 3c87057436d5..8b4575a0db9f 100644 --- a/drivers/s390/char/sclp_cmd.c +++ b/drivers/s390/char/sclp_cmd.c @@ -392,10 +392,6 @@ static void __init add_memory_merged(u16 rn) goto skip_add; start = rn2addr(first_rn); size = (unsigned long long) num * sclp.rzm; - if (start >= VMEM_MAX_PHYS) - goto skip_add; - if (start + size > VMEM_MAX_PHYS) - size = VMEM_MAX_PHYS - start; if (start >= ident_map_size) goto skip_add; if (start + size > ident_map_size) -- cgit v1.2.3 From 94fd522069e124297c094840473f0d9637c3d991 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Sun, 2 Jul 2023 23:11:18 +0200 Subject: s390/mm: rework arch_get_mappable_range() callback As per description in mm/memory_hotplug.c platforms should define arch_get_mappable_range() that provides maximum possible addressable physical memory range for which the linear mapping could be created. The current implementation uses VMEM_MAX_PHYS macro as the maximum mappable physical address and it is simply a cast to vmemmap. Since the address is in physical address space the natural upper limit of MAX_PHYSMEM_BITS is honoured: vmemmap_start = min(vmemmap_start, 1UL << MAX_PHYSMEM_BITS); Further, to make sure the identity mapping would not overlay with vmemmap, the size of identity mapping could be stripped like this: ident_map_size = min(ident_map_size, vmemmap_start); Similarly, any other memory that could be added (e.g. DCSS segment) should not overlay with vmemmap as well and that is prevented by using vmemmap (VMEM_MAX_PHYS macro) as the upper limit. However, while the use of VMEM_MAX_PHYS brings the desired result it actually poses two issues: 1.
As described, vmemmap is handled as a physical address, although it is actually a pointer to struct page in virtual address space. 2. As vmemmap is a virtual address it could have been located anywhere in the virtual address space. However, the desired necessity to honour MAX_PHYSMEM_BITS limit prevents that. Rework arch_get_mappable_range() callback in a way it does not use VMEM_MAX_PHYS macro and does not confuse the notion of virtual vs physical address spaces as a result. That paves the way for moving vmemmap elsewhere and optimizing the virtual address space layout. Introduce max_mappable preserved boot variable and let function setup_kernel_memory_layout() set it up. As a result, the rest of the code does not need to know the virtual memory layout specifics. Reviewed-by: Heiko Carstens Signed-off-by: Alexander Gordeev Signed-off-by: Heiko Carstens --- arch/s390/boot/startup.c | 3 +++ arch/s390/include/asm/setup.h | 1 + arch/s390/kernel/setup.c | 1 + arch/s390/mm/vmem.c | 2 +- 4 files changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c index 64bd7ac3e35d..59abdd0ab56f 100644 --- a/arch/s390/boot/startup.c +++ b/arch/s390/boot/startup.c @@ -27,6 +27,7 @@ struct page *__bootdata_preserved(vmemmap); unsigned long __bootdata_preserved(vmemmap_size); unsigned long __bootdata_preserved(MODULES_VADDR); unsigned long __bootdata_preserved(MODULES_END); +unsigned long __bootdata_preserved(max_mappable); unsigned long __bootdata(ident_map_size); u64 __bootdata_preserved(stfle_fac_list[16]); @@ -222,6 +223,8 @@ static unsigned long setup_kernel_memory_layout(void) vmemmap_start = round_down(VMALLOC_START - pages * sizeof(struct page), rte_size); /* vmemmap_start is the future VMEM_MAX_PHYS, make sure it is within MAX_PHYSMEM */ vmemmap_start = min(vmemmap_start, 1UL << MAX_PHYSMEM_BITS); + /* maximum mappable address as seen by arch_get_mappable_range() */ + max_mappable = vmemmap_start; /* make sure identity map
doesn't overlay with vmemmap */ ident_map_size = min(ident_map_size, vmemmap_start); vmemmap_size = SECTION_ALIGN_UP(ident_map_size / PAGE_SIZE) * sizeof(struct page); diff --git a/arch/s390/include/asm/setup.h b/arch/s390/include/asm/setup.h index f191255c60db..e795f425627a 100644 --- a/arch/s390/include/asm/setup.h +++ b/arch/s390/include/asm/setup.h @@ -74,6 +74,7 @@ extern unsigned int zlib_dfltcc_support; extern int noexec_disabled; extern unsigned long ident_map_size; +extern unsigned long max_mappable; /* The Write Back bit position in the physaddr is given by the SLPC PCI */ extern unsigned long mio_wb_bit_mask; diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 00d76448319d..393dd8385506 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -146,6 +146,7 @@ static u32 __amode31_ref *__ctl_linkage_stack = __ctl_linkage_stack_amode31; static u32 __amode31_ref *__ctl_duct = __ctl_duct_amode31; int __bootdata(noexec_disabled); +unsigned long __bootdata_preserved(max_mappable); unsigned long __bootdata(ident_map_size); struct physmem_info __bootdata(physmem_info); diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index b26649233d12..be69cb2d47eb 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -531,7 +531,7 @@ struct range arch_get_mappable_range(void) struct range mhp_range; mhp_range.start = 0; - mhp_range.end = VMEM_MAX_PHYS - 1; + mhp_range.end = max_mappable - 1; return mhp_range; } -- cgit v1.2.3 From 9916bf4edac6f8f499ce7c42dafb57e242865790 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Wed, 5 Jul 2023 14:17:11 +0200 Subject: s390/extmem: improve reporting of -ERANGE error Interface segment_warning() reports maximum mappable physical address for -ERANGE error. Currently that address is the value of VMEM_MAX_PHYS macro, but that well might change. 
A better way to obtain that address is calling arch_get_mappable_range() callback - one that is used by vmem_add_mapping() and generates -ERANGE error in the first place. Reviewed-by: Heiko Carstens Signed-off-by: Alexander Gordeev Signed-off-by: Heiko Carstens --- arch/s390/mm/extmem.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/s390/mm/extmem.c b/arch/s390/mm/extmem.c index 1bc42ce26599..e41869f5cc95 100644 --- a/arch/s390/mm/extmem.c +++ b/arch/s390/mm/extmem.c @@ -640,10 +640,13 @@ void segment_warning(int rc, char *seg_name) pr_err("There is not enough memory to load or query " "DCSS %s\n", seg_name); break; - case -ERANGE: - pr_err("DCSS %s exceeds the kernel mapping range (%lu) " - "and cannot be loaded\n", seg_name, VMEM_MAX_PHYS); + case -ERANGE: { + struct range mhp_range = arch_get_mappable_range(); + + pr_err("DCSS %s exceeds the kernel mapping range (%llu) " + "and cannot be loaded\n", seg_name, mhp_range.end + 1); break; + } default: break; } -- cgit v1.2.3 From e7e828ebeb5d80d42c9ac514db5fb3d33367cf10 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Wed, 5 Jul 2023 15:50:19 +0200 Subject: s390/mm: get rid of VMEM_MAX_PHYS macro There are no users of VMEM_MAX_PHYS macro left, remove it. 
Reviewed-by: Heiko Carstens Signed-off-by: Alexander Gordeev Signed-off-by: Heiko Carstens --- arch/s390/boot/startup.c | 1 - arch/s390/include/asm/pgtable.h | 2 -- 2 files changed, 3 deletions(-) diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c index 59abdd0ab56f..a1f792fcc710 100644 --- a/arch/s390/boot/startup.c +++ b/arch/s390/boot/startup.c @@ -221,7 +221,6 @@ static unsigned long setup_kernel_memory_layout(void) pages = SECTION_ALIGN_UP(pages); /* keep vmemmap_start aligned to a top level region table entry */ vmemmap_start = round_down(VMALLOC_START - pages * sizeof(struct page), rte_size); - /* vmemmap_start is the future VMEM_MAX_PHYS, make sure it is within MAX_PHYSMEM */ vmemmap_start = min(vmemmap_start, 1UL << MAX_PHYSMEM_BITS); /* maximum mappable address as seen by arch_get_mappable_range() */ max_mappable = vmemmap_start; diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index c55f3c3365af..30909fe27c24 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -89,8 +89,6 @@ extern unsigned long __bootdata_preserved(VMALLOC_END); extern struct page *__bootdata_preserved(vmemmap); extern unsigned long __bootdata_preserved(vmemmap_size); -#define VMEM_MAX_PHYS ((unsigned long) vmemmap) - extern unsigned long __bootdata_preserved(MODULES_VADDR); extern unsigned long __bootdata_preserved(MODULES_END); #define MODULES_VADDR MODULES_VADDR -- cgit v1.2.3 From 5216d853cb154a4866c9984fd41f71583e8bdd39 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Fri, 14 Jul 2023 10:42:56 +0200 Subject: s390/hypfs: stop using ENOSYS error code ENOSYS should only be returned to userspace when a syscall is not implemented. The only known user, 'hyptop', is not explicitly checking for -ENOSYS, therefore use EOPNOTSUPP instead. It is very unlikely that there are other users, so this change should have no impact on userspace. 
Signed-off-by: Sven Schnelle Acked-by: Heiko Carstens Signed-off-by: Heiko Carstens --- arch/s390/hypfs/hypfs_diag.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/s390/hypfs/hypfs_diag.c b/arch/s390/hypfs/hypfs_diag.c index ea4c436f86a0..279b7bba4d43 100644 --- a/arch/s390/hypfs/hypfs_diag.c +++ b/arch/s390/hypfs/hypfs_diag.c @@ -67,7 +67,7 @@ void *diag204_get_buffer(enum diag204_format fmt, int *pages) *pages = diag204((unsigned long)DIAG204_SUBC_RSI | (unsigned long)DIAG204_INFO_EXT, 0, NULL); if (*pages <= 0) - return ERR_PTR(-ENOSYS); + return ERR_PTR(-EOPNOTSUPP); } diag204_buf = __vmalloc_node(array_size(*pages, PAGE_SIZE), PAGE_SIZE, GFP_KERNEL, NUMA_NO_NODE, @@ -127,7 +127,7 @@ static int diag204_probe(void) diag204_set_info_type(DIAG204_INFO_SIMPLE); goto out; } else { - rc = -ENOSYS; + rc = -EOPNOTSUPP; goto fail_store; } out: @@ -144,7 +144,7 @@ int diag204_store(void *buf, int pages) rc = diag204((unsigned long)diag204_store_sc | (unsigned long)diag204_get_info_type(), pages, buf); - return rc < 0 ? -ENOSYS : 0; + return rc < 0 ? -EOPNOTSUPP : 0; } struct dbfs_d204_hdr { -- cgit v1.2.3 From e3123dfb5373939d65ac2b874189a773d37ac7f5 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Mon, 17 Jul 2023 10:14:32 +0200 Subject: s390/tracing: pass struct ftrace_regs to ftrace_trace_function ftrace_trace_function expects a struct ftrace_regs, but the s390 architecture code passes struct pt_regs. This isn't a problem with the current code because struct ftrace_regs contains only one member: struct pt_regs. To avoid issues in the future this should be fixed. 
Signed-off-by: Sven Schnelle Reviewed-by: Heiko Carstens Signed-off-by: Heiko Carstens --- arch/s390/kernel/asm-offsets.c | 2 ++ arch/s390/kernel/mcount.S | 57 ++++++++++++++++++++++++------------------ 2 files changed, 34 insertions(+), 25 deletions(-) diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c index 150809c9ffa5..fa5f6885c74a 100644 --- a/arch/s390/kernel/asm-offsets.c +++ b/arch/s390/kernel/asm-offsets.c @@ -184,5 +184,7 @@ int main(void) OFFSET(__FGRAPH_RET_FP, fgraph_ret_regs, fp); DEFINE(__FGRAPH_RET_SIZE, sizeof(struct fgraph_ret_regs)); #endif + OFFSET(__FTRACE_REGS_PT_REGS, ftrace_regs, regs); + DEFINE(__FTRACE_REGS_SIZE, sizeof(struct ftrace_regs)); return 0; } diff --git a/arch/s390/kernel/mcount.S b/arch/s390/kernel/mcount.S index fd27ff9f2cf3..d2596e0df6fa 100644 --- a/arch/s390/kernel/mcount.S +++ b/arch/s390/kernel/mcount.S @@ -12,12 +12,19 @@ #include -#define STACK_FRAME_SIZE (STACK_FRAME_OVERHEAD + __PT_SIZE) -#define STACK_PTREGS (STACK_FRAME_OVERHEAD) -#define STACK_PTREGS_GPRS (STACK_PTREGS + __PT_GPRS) -#define STACK_PTREGS_PSW (STACK_PTREGS + __PT_PSW) -#define STACK_PTREGS_ORIG_GPR2 (STACK_PTREGS + __PT_ORIG_GPR2) -#define STACK_PTREGS_FLAGS (STACK_PTREGS + __PT_FLAGS) +#define STACK_FRAME_SIZE_PTREGS (STACK_FRAME_OVERHEAD + __PT_SIZE) +#define STACK_PTREGS (STACK_FRAME_OVERHEAD) +#define STACK_PTREGS_GPRS (STACK_PTREGS + __PT_GPRS) +#define STACK_PTREGS_PSW (STACK_PTREGS + __PT_PSW) + +#define STACK_FRAME_SIZE_FREGS (STACK_FRAME_OVERHEAD + __FTRACE_REGS_SIZE) +#define STACK_FREGS (STACK_FRAME_OVERHEAD) +#define STACK_FREGS_PTREGS (STACK_FRAME_OVERHEAD + __FTRACE_REGS_PT_REGS) +#define STACK_FREGS_PTREGS_GPRS (STACK_FREGS_PTREGS + __PT_GPRS) +#define STACK_FREGS_PTREGS_PSW (STACK_FREGS_PTREGS + __PT_PSW) +#define STACK_FREGS_PTREGS_ORIG_GPR2 (STACK_FREGS_PTREGS + __PT_ORIG_GPR2) +#define STACK_FREGS_PTREGS_FLAGS (STACK_FREGS_PTREGS + __PT_FLAGS) + /* packed stack: allocate just enough for r14, r15 and 
backchain */ #define TRACED_FUNC_FRAME_SIZE 24 @@ -53,23 +60,23 @@ SYM_CODE_END(ftrace_stub_direct_tramp) stg %r1,__SF_BACKCHAIN(%r15) stg %r0,(__SF_GPRS+8*8)(%r15) stg %r15,(__SF_GPRS+9*8)(%r15) - # allocate pt_regs and stack frame for ftrace_trace_function - aghi %r15,-STACK_FRAME_SIZE - stg %r1,(STACK_PTREGS_GPRS+15*8)(%r15) - xc STACK_PTREGS_ORIG_GPR2(8,%r15),STACK_PTREGS_ORIG_GPR2(%r15) + # allocate ftrace_regs and stack frame for ftrace_trace_function + aghi %r15,-STACK_FRAME_SIZE_FREGS + stg %r1,(STACK_FREGS_PTREGS_GPRS+15*8)(%r15) + xc STACK_FREGS_PTREGS_ORIG_GPR2(8,%r15),STACK_FREGS_PTREGS_ORIG_GPR2(%r15) .if \allregs == 1 - stg %r14,(STACK_PTREGS_PSW)(%r15) - mvghi STACK_PTREGS_FLAGS(%r15),_PIF_FTRACE_FULL_REGS + stg %r14,(STACK_FREGS_PTREGS_PSW)(%r15) + mvghi STACK_FREGS_PTREGS_FLAGS(%r15),_PIF_FTRACE_FULL_REGS .else - xc STACK_PTREGS_FLAGS(8,%r15),STACK_PTREGS_FLAGS(%r15) + xc STACK_FREGS_PTREGS_FLAGS(8,%r15),STACK_FREGS_PTREGS_FLAGS(%r15) .endif lg %r14,(__SF_GPRS+8*8)(%r1) # restore original return address aghi %r1,-TRACED_FUNC_FRAME_SIZE stg %r1,__SF_BACKCHAIN(%r15) - stg %r0,(STACK_PTREGS_PSW+8)(%r15) - stmg %r2,%r14,(STACK_PTREGS_GPRS+2*8)(%r15) + stg %r0,(STACK_FREGS_PTREGS_PSW+8)(%r15) + stmg %r2,%r14,(STACK_FREGS_PTREGS_GPRS+2*8)(%r15) .endm SYM_CODE_START(ftrace_regs_caller) @@ -96,30 +103,30 @@ SYM_CODE_START(ftrace_common) lg %r1,0(%r1) #endif lgr %r3,%r14 - la %r5,STACK_PTREGS(%r15) + la %r5,STACK_FREGS(%r15) BASR_EX %r14,%r1 #ifdef CONFIG_FUNCTION_GRAPH_TRACER # The j instruction gets runtime patched to a nop instruction. # See ftrace_enable_ftrace_graph_caller. 
SYM_INNER_LABEL(ftrace_graph_caller, SYM_L_GLOBAL) j .Lftrace_graph_caller_end - lmg %r2,%r3,(STACK_PTREGS_GPRS+14*8)(%r15) - lg %r4,(STACK_PTREGS_PSW+8)(%r15) + lmg %r2,%r3,(STACK_FREGS_PTREGS_GPRS+14*8)(%r15) + lg %r4,(STACK_FREGS_PTREGS_PSW+8)(%r15) brasl %r14,prepare_ftrace_return - stg %r2,(STACK_PTREGS_GPRS+14*8)(%r15) + stg %r2,(STACK_FREGS_PTREGS_GPRS+14*8)(%r15) .Lftrace_graph_caller_end: #endif - lg %r0,(STACK_PTREGS_PSW+8)(%r15) + lg %r0,(STACK_FREGS_PTREGS_PSW+8)(%r15) #ifdef CONFIG_HAVE_MARCH_Z196_FEATURES - ltg %r1,STACK_PTREGS_ORIG_GPR2(%r15) + ltg %r1,STACK_FREGS_PTREGS_ORIG_GPR2(%r15) locgrz %r1,%r0 #else - lg %r1,STACK_PTREGS_ORIG_GPR2(%r15) + lg %r1,STACK_FREGS_PTREGS_ORIG_GPR2(%r15) ltgr %r1,%r1 jnz 0f lgr %r1,%r0 #endif -0: lmg %r2,%r15,(STACK_PTREGS_GPRS+2*8)(%r15) +0: lmg %r2,%r15,(STACK_FREGS_PTREGS_GPRS+2*8)(%r15) BR_EX %r1 SYM_CODE_END(ftrace_common) @@ -164,11 +171,11 @@ SYM_CODE_END(ftrace_shared_hotpatch_trampoline_exrl) SYM_CODE_START(arch_rethook_trampoline) stg %r14,(__SF_GPRS+8*8)(%r15) - lay %r15,-STACK_FRAME_SIZE(%r15) + lay %r15,-STACK_FRAME_SIZE_PTREGS(%r15) stmg %r0,%r14,STACK_PTREGS_GPRS(%r15) # store original stack pointer in backchain and pt_regs - lay %r7,STACK_FRAME_SIZE(%r15) + lay %r7,STACK_FRAME_SIZE_PTREGS(%r15) stg %r7,__SF_BACKCHAIN(%r15) stg %r7,STACK_PTREGS_GPRS+(15*8)(%r15) -- cgit v1.2.3 From 37002bc6b6039e1491140869c6801e0a2deee43e Mon Sep 17 00:00:00 2001 From: Costa Shulyupin Date: Tue, 18 Jul 2023 07:55:02 +0300 Subject: docs: move s390 under arch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit and fix all in-tree references. Architecture-specific documentation is being moved into Documentation/arch/ as a way of cleaning up the top-level documentation directory and making the docs hierarchy more closely match the source hierarchy. 
Signed-off-by: Costa Shulyupin Reviewed-by: Tony Krowiak Acked-by: Jonathan Corbet Acked-by: Heiko Carstens Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Randy Dunlap Link: https://lore.kernel.org/r/20230718045550.495428-1-costa.shul@redhat.com Signed-off-by: Heiko Carstens --- Documentation/admin-guide/kernel-parameters.txt | 4 +- Documentation/arch/index.rst | 2 +- Documentation/arch/s390/3270.ChangeLog | 44 + Documentation/arch/s390/3270.rst | 298 +++++++ Documentation/arch/s390/cds.rst | 530 +++++++++++ Documentation/arch/s390/common_io.rst | 140 +++ Documentation/arch/s390/config3270.sh | 76 ++ Documentation/arch/s390/driver-model.rst | 328 +++++++ Documentation/arch/s390/features.rst | 3 + Documentation/arch/s390/index.rst | 30 + Documentation/arch/s390/monreader.rst | 212 +++++ Documentation/arch/s390/pci.rst | 133 +++ Documentation/arch/s390/qeth.rst | 64 ++ Documentation/arch/s390/s390dbf.rst | 478 ++++++++++ Documentation/arch/s390/text_files.rst | 11 + Documentation/arch/s390/vfio-ap-locking.rst | 115 +++ Documentation/arch/s390/vfio-ap.rst | 1069 +++++++++++++++++++++++ Documentation/arch/s390/vfio-ccw.rst | 445 ++++++++++ Documentation/arch/s390/zfcpdump.rst | 50 ++ Documentation/driver-api/s390-drivers.rst | 4 +- Documentation/s390/3270.ChangeLog | 44 - Documentation/s390/3270.rst | 298 ------- Documentation/s390/cds.rst | 530 ----------- Documentation/s390/common_io.rst | 140 --- Documentation/s390/config3270.sh | 76 -- Documentation/s390/driver-model.rst | 328 ------- Documentation/s390/features.rst | 3 - Documentation/s390/index.rst | 30 - Documentation/s390/monreader.rst | 212 ----- Documentation/s390/pci.rst | 133 --- Documentation/s390/qeth.rst | 64 -- Documentation/s390/s390dbf.rst | 478 ---------- Documentation/s390/text_files.rst | 11 - Documentation/s390/vfio-ap-locking.rst | 115 --- Documentation/s390/vfio-ap.rst | 1069 ----------------------- Documentation/s390/vfio-ccw.rst | 445 ---------- Documentation/s390/zfcpdump.rst | 50 -- 
MAINTAINERS | 8 +- arch/s390/Kconfig | 4 +- arch/s390/include/asm/debug.h | 4 +- drivers/s390/char/zcore.c | 2 +- 41 files changed, 4040 insertions(+), 4040 deletions(-) create mode 100644 Documentation/arch/s390/3270.ChangeLog create mode 100644 Documentation/arch/s390/3270.rst create mode 100644 Documentation/arch/s390/cds.rst create mode 100644 Documentation/arch/s390/common_io.rst create mode 100644 Documentation/arch/s390/config3270.sh create mode 100644 Documentation/arch/s390/driver-model.rst create mode 100644 Documentation/arch/s390/features.rst create mode 100644 Documentation/arch/s390/index.rst create mode 100644 Documentation/arch/s390/monreader.rst create mode 100644 Documentation/arch/s390/pci.rst create mode 100644 Documentation/arch/s390/qeth.rst create mode 100644 Documentation/arch/s390/s390dbf.rst create mode 100644 Documentation/arch/s390/text_files.rst create mode 100644 Documentation/arch/s390/vfio-ap-locking.rst create mode 100644 Documentation/arch/s390/vfio-ap.rst create mode 100644 Documentation/arch/s390/vfio-ccw.rst create mode 100644 Documentation/arch/s390/zfcpdump.rst delete mode 100644 Documentation/s390/3270.ChangeLog delete mode 100644 Documentation/s390/3270.rst delete mode 100644 Documentation/s390/cds.rst delete mode 100644 Documentation/s390/common_io.rst delete mode 100644 Documentation/s390/config3270.sh delete mode 100644 Documentation/s390/driver-model.rst delete mode 100644 Documentation/s390/features.rst delete mode 100644 Documentation/s390/index.rst delete mode 100644 Documentation/s390/monreader.rst delete mode 100644 Documentation/s390/pci.rst delete mode 100644 Documentation/s390/qeth.rst delete mode 100644 Documentation/s390/s390dbf.rst delete mode 100644 Documentation/s390/text_files.rst delete mode 100644 Documentation/s390/vfio-ap-locking.rst delete mode 100644 Documentation/s390/vfio-ap.rst delete mode 100644 Documentation/s390/vfio-ccw.rst delete mode 100644 Documentation/s390/zfcpdump.rst diff --git 
a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index a1457995fd41..94b3dc21eec8 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -553,7 +553,7 @@ others). ccw_timeout_log [S390] - See Documentation/s390/common_io.rst for details. + See Documentation/arch/s390/common_io.rst for details. cgroup_disable= [KNL] Disable a particular controller or optional feature Format: {name of the controller(s) or feature(s) to disable} @@ -598,7 +598,7 @@ Setting checkreqprot to 1 is deprecated. cio_ignore= [S390] - See Documentation/s390/common_io.rst for details. + See Documentation/arch/s390/common_io.rst for details. clearcpuid=X[,X...] [X86] Disable CPUID feature X for the kernel. See diff --git a/Documentation/arch/index.rst b/Documentation/arch/index.rst index 8458b88e9b79..c9a209878cf3 100644 --- a/Documentation/arch/index.rst +++ b/Documentation/arch/index.rst @@ -21,7 +21,7 @@ implementation. parisc/index ../powerpc/index ../riscv/index - ../s390/index + s390/index sh/index sparc/index x86/index diff --git a/Documentation/arch/s390/3270.ChangeLog b/Documentation/arch/s390/3270.ChangeLog new file mode 100644 index 000000000000..ecaf60b6c381 --- /dev/null +++ b/Documentation/arch/s390/3270.ChangeLog @@ -0,0 +1,44 @@ +ChangeLog for the UTS Global 3270-support patch + +Sep 2002: Get bootup colors right on 3270 console + * In tubttybld.c, substantially revise ESC processing so that + ESC sequences (especially coloring ones) and the strings + they affect work as right as 3270 can get them. Also, set + screen height to omit the two rows used for input area, in + tty3270_open() in tubtty.c. + +Sep 2002: Dynamically get 3270 input buffer + * Oversize 3270 screen widths may exceed GEOM_MAXINPLEN columns, + so get input-area buffer dynamically when sizing the device in + tubmakemin() in tuball.c (if it's the console) or tty3270_open() + in tubtty.c (if needed). 
Change tubp->tty_input to be a + pointer rather than an array, in tubio.h. + +Sep 2002: Fix tubfs kmalloc()s + * Do read and write lengths correctly in fs3270_read() + and fs3270_write(), while never asking kmalloc() + for more than 0x800 bytes. Affects tubfs.c and tubio.h. + +Sep 2002: Recognize 3270 control unit type 3174 + * Recognize control-unit type 0x3174 as well as 0x327?. + The IBM 2047 device emulates a 3174 control unit. + Modularize control-unit recognition in tuball.c by + adding and invoking new tub3270_is_ours(). + +Apr 2002: Fix 3270 console reboot loop + * (Belated log entry) Fixed reboot loop if 3270 console, + in tubtty.c:ttu3270_bh(). + +Feb 6, 2001: + * This changelog is new + * tub3270 now supports 3270 console: + Specify y for CONFIG_3270 and y for CONFIG_3270_CONSOLE. + Support for 3215 will not appear if 3270 console support + is chosen. + NOTE: The default is 3270 console support, NOT 3215. + * the components are remodularized: added source modules are + tubttybld.c and tubttyscl.c, for screen-building code and + scroll-timeout code. + * tub3270 source for this (2.4.0) version is #ifdeffed to + build with both 2.4.0 and 2.2.16.2. + * color support and minimal other ESC-sequence support is added. diff --git a/Documentation/arch/s390/3270.rst b/Documentation/arch/s390/3270.rst new file mode 100644 index 000000000000..467eace91473 --- /dev/null +++ b/Documentation/arch/s390/3270.rst @@ -0,0 +1,298 @@ +=============================== +IBM 3270 Display System support +=============================== + +This file describes the driver that supports local channel attachment +of IBM 3270 devices. It consists of three sections: + + * Introduction + * Installation + * Operation + + +Introduction +============ + +This paper describes installing and operating 3270 devices under +Linux/390. A 3270 device is a block-mode rows-and-columns terminal of +which I'm sure hundreds of millions were sold by IBM and clonemakers +twenty and thirty years ago. 
+ +You may have 3270s in-house and not know it. If you're using the +VM-ESA operating system, define a 3270 to your virtual machine by using +the command "DEF GRAF " This paper presumes you will be +defining four 3270s with the CP/CMS commands: + + - DEF GRAF 620 + - DEF GRAF 621 + - DEF GRAF 622 + - DEF GRAF 623 + +Your network connection from VM-ESA allows you to use x3270, tn3270, or +another 3270 emulator, started from an xterm window on your PC or +workstation. With the DEF GRAF command, an application such as xterm, +and this Linux-390 3270 driver, you have another way of talking to your +Linux box. + +This paper covers installation of the driver and operation of a +dialed-in x3270. + + +Installation +============ + +You install the driver by installing a patch, doing a kernel build, and +running the configuration script (config3270.sh, in this directory). + +WARNING: If you are using 3270 console support, you must rerun the +configuration script every time you change the console's address (perhaps +by using the condev= parameter in silo's /boot/parmfile). More precisely, +you should rerun the configuration script every time your set of 3270s, +including the console 3270, changes subchannel identifier relative to +one another. ReIPL as soon as possible after running the configuration +script and the resulting /tmp/mkdev3270. + +If you have chosen to make tub3270 a module, you add a line to a +configuration file under /etc/modprobe.d/. If you are working on a VM +virtual machine, you can use DEF GRAF to define virtual 3270 devices. + +You may generate both 3270 and 3215 console support, or one or the +other, or neither. If you generate both, the console type under VM is +not changed. Use #CP Q TERM to see what the current console type is. +Use #CP TERM CONMODE 3270 to change it to 3270. If you generate only +3270 console support, then the driver automatically converts your console +at boot time to a 3270 if it is a 3215. 
+ +In brief, these are the steps: + + 1. Install the tub3270 patch + 2. (If a module) add a line to a file in `/etc/modprobe.d/*.conf` + 3. (If VM) define devices with DEF GRAF + 4. Reboot + 5. Configure + +To test that everything works, assuming VM and x3270, + + 1. Bring up an x3270 window. + 2. Use the DIAL command in that window. + 3. You should immediately see a Linux login screen. + +Here are the installation steps in detail: + + 1. The 3270 driver is a part of the official Linux kernel + source. Build a tree with the kernel source and any necessary + patches. Then do:: + + make oldconfig + (If you wish to disable 3215 console support, edit + .config; change CONFIG_TN3215's value to "n"; + and rerun "make oldconfig".) + make image + make modules + make modules_install + + 2. (Perform this step only if you have configured tub3270 as a + module.) Add a line to a file `/etc/modprobe.d/*.conf` to automatically + load the driver when it's needed. With this line added, you will see + login prompts appear on your 3270s as soon as boot is complete (or + with emulated 3270s, as soon as you dial into your vm guest using the + command "DIAL "). Since the line-mode major number is + 227, the line to add should be:: + + alias char-major-227 tub3270 + + 3. Define graphic devices to your vm guest machine, if you + haven't already. Define them before you reboot (reipl): + + - DEFINE GRAF 620 + - DEFINE GRAF 621 + - DEFINE GRAF 622 + - DEFINE GRAF 623 + + 4. Reboot. The reboot process scans hardware devices, including + 3270s, and this enables the tub3270 driver once loaded to respond + correctly to the configuration requests of the next step. If + you have chosen 3270 console support, your console now behaves + as a 3270, not a 3215. + + 5. Run the 3270 configuration script config3270. It is + distributed in this same directory, Documentation/arch/s390, as + config3270.sh. Inspect the output script it produces, + /tmp/mkdev3270, and then run that script. 
This will create the + necessary character special device files and make the necessary + changes to /etc/inittab. + + Then notify /sbin/init that /etc/inittab has changed, by issuing + the telinit command with the q operand:: + + cd Documentation/arch/s390 + sh config3270.sh + sh /tmp/mkdev3270 + telinit q + + This should be sufficient for your first time. If your 3270 + configuration has changed and you're reusing config3270, you + should follow these steps:: + + Change 3270 configuration + Reboot + Run config3270 and /tmp/mkdev3270 + Reboot + +Here are the testing steps in detail: + + 1. Bring up an x3270 window, or use an actual hardware 3278 or + 3279, or use the 3270 emulator of your choice. You would be + running the emulator on your PC or workstation. You would use + the command, for example:: + + x3270 vm-esa-domain-name & + + if you wanted a 3278 Model 4 with 43 rows of 80 columns, the + default model number. The driver does not take advantage of + extended attributes. + + The screen you should now see contains a VM logo with input + lines near the bottom. Use TAB to move to the bottom line, + probably labeled "COMMAND ===>". + + 2. Use the DIAL command instead of the LOGIN command to connect + to one of the virtual 3270s you defined with the DEF GRAF + commands:: + + dial my-vm-guest-name + + 3. You should immediately see a login prompt from your + Linux-390 operating system. If that does not happen, you would + see instead the line "DIALED TO my-vm-guest-name 0620". + + To troubleshoot: do these things. + + A. Is the driver loaded? Use the lsmod command (no operands) + to find out. Probably it isn't. Try loading it manually, with + the command "insmod tub3270". Does that command give error + messages? Ha! There's your problem. + + B. Is the /etc/inittab file modified as in installation step 3 + above? Use the grep command to find out; for instance, issue + "grep 3270 /etc/inittab". Nothing found? There's your + problem! + + C. 
Are the device special files created, as in installation + step 2 above? Use the ls -l command to find out; for instance, + issue "ls -l /dev/3270/tty620". The output should start with the + letter "c" meaning character device and should contain "227, 1" + just to the left of the device name. No such file? no "c"? + Wrong major number? Wrong minor number? There's your + problem! + + D. Do you get the message:: + + "HCPDIA047E my-vm-guest-name 0620 does not exist"? + + If so, you must issue the command "DEF GRAF 620" from your VM + 3215 console and then reboot the system. + + + +OPERATION. +========== + +The driver defines three areas on the 3270 screen: the log area, the +input area, and the status area. + +The log area takes up all but the bottom two lines of the screen. The +driver writes terminal output to it, starting at the top line and going +down. When it fills, the status area changes from "Linux Running" to +"Linux More...". After a scrolling timeout of (default) 5 sec, the +screen clears and more output is written, from the top down. + +The input area extends from the beginning of the second-to-last screen +line to the start of the status area. You type commands in this area +and hit ENTER to execute them. + +The status area initializes to "Linux Running" to give you a warm +fuzzy feeling. When the log area fills up and output awaits, it +changes to "Linux More...". At this time you can do several things or +nothing. If you do nothing, the screen will clear in (default) 5 sec +and more output will appear. You may hit ENTER with nothing typed in +the input area to toggle between "Linux More..." and "Linux Holding", +which indicates no scrolling will occur. (If you hit ENTER with "Linux +Running" and nothing typed, the application receives a newline.) + +You may change the scrolling timeout value. For example, the following +command line:: + + echo scrolltime=60 > /proc/tty/driver/tty3270 + +changes the scrolling timeout value to 60 sec. 
Set scrolltime to 0 if +you wish to prevent scrolling entirely. + +Other things you may do when the log area fills up are: hit PA2 to +clear the log area and write more output to it, or hit CLEAR to clear +the log area and the input area and write more output to the log area. + +Some of the Program Function (PF) and Program Attention (PA) keys are +preassigned special functions. The ones that are not yield an alarm +when pressed. + +PA1 causes a SIGINT to the currently running application. You may do +the same thing from the input area, by typing "^C" and hitting ENTER. + +PA2 causes the log area to be cleared. If output awaits, it is then +written to the log area. + +PF3 causes an EOF to be received as input by the application. You may +cause an EOF also by typing "^D" and hitting ENTER. + +No PF key is preassigned to cause a job suspension, but you may cause a +job suspension by typing "^Z" and hitting ENTER. You may wish to +assign this function to a PF key. To make PF7 cause job suspension, +execute the command:: + + echo pf7=^z > /proc/tty/driver/tty3270 + +If the input you type does not end with the two characters "^n", the +driver appends a newline character and sends it to the tty driver; +otherwise the driver strips the "^n" and does not append a newline. +The IBM 3215 driver behaves similarly. + +Pf10 causes the most recent command to be retrieved from the tube's +command stack (default depth 20) and displayed in the input area. You +may hit PF10 again for the next-most-recent command, and so on. A +command is entered into the stack only when the input area is not made +invisible (such as for password entry) and it is not identical to the +current top entry. PF10 rotates backward through the command stack; +PF11 rotates forward. You may assign the backward function to any PF +key (or PA key, for that matter), say, PA3, with the command:: + + echo -e pa3=\\033k > /proc/tty/driver/tty3270 + +This assigns the string ESC-k to PA3. 
Similarly, the string ESC-j +performs the forward function. (Rationale: In bash with vi-mode line +editing, ESC-k and ESC-j retrieve backward and forward history. +Suggestions welcome.) + +Is a stack size of twenty commands not to your liking? Change it on +the fly. To change to saving the last 100 commands, execute the +command:: + + echo recallsize=100 > /proc/tty/driver/tty3270 + +Have a command you issue frequently? Assign it to a PF or PA key! Use +the command:: + + echo pf24="mkdir foobar; cd foobar" > /proc/tty/driver/tty3270 + +to execute the commands mkdir foobar and cd foobar immediately when you +hit PF24. Want to see the command line first, before you execute it? +Use the -n option of the echo command:: + + echo -n pf24="mkdir foo; cd foo" > /proc/tty/driver/tty3270 + + + +Happy testing! I welcome any and all comments about this document, the +driver, etc etc. + +Dick Hitt diff --git a/Documentation/arch/s390/cds.rst b/Documentation/arch/s390/cds.rst new file mode 100644 index 000000000000..bcad2a14244a --- /dev/null +++ b/Documentation/arch/s390/cds.rst @@ -0,0 +1,530 @@ +=========================== +Linux for S/390 and zSeries +=========================== + +Common Device Support (CDS) +Device Driver I/O Support Routines + +Authors: + - Ingo Adlung + - Cornelia Huck + +Copyright, IBM Corp. 1999-2002 + +Introduction +============ + +This document describes the common device support routines for Linux/390. +Different than other hardware architectures, ESA/390 has defined a unified +I/O access method. This gives relief to the device drivers as they don't +have to deal with different bus types, polling versus interrupt +processing, shared versus non-shared interrupt processing, DMA versus port +I/O (PIO), and other hardware features more. 
However, this implies that +either every single device driver needs to implement the hardware I/O +attachment functionality itself, or the operating system provides for a +unified method to access the hardware, providing all the functionality that +every single device driver would have to provide itself. + +The document does not intend to explain the ESA/390 hardware architecture in +every detail.This information can be obtained from the ESA/390 Principles of +Operation manual (IBM Form. No. SA22-7201). + +In order to build common device support for ESA/390 I/O interfaces, a +functional layer was introduced that provides generic I/O access methods to +the hardware. + +The common device support layer comprises the I/O support routines defined +below. Some of them implement common Linux device driver interfaces, while +some of them are ESA/390 platform specific. + +Note: + In order to write a driver for S/390, you also need to look into the interface + described in Documentation/arch/s390/driver-model.rst. + +Note for porting drivers from 2.4: + +The major changes are: + +* The functions use a ccw_device instead of an irq (subchannel). +* All drivers must define a ccw_driver (see driver-model.txt) and the associated + functions. +* request_irq() and free_irq() are no longer done by the driver. +* The oper_handler is (kindof) replaced by the probe() and set_online() functions + of the ccw_driver. +* The not_oper_handler is (kindof) replaced by the remove() and set_offline() + functions of the ccw_driver. +* The channel device layer is gone. +* The interrupt handlers must be adapted to use a ccw_device as argument. + Moreover, they don't return a devstat, but an irb. +* Before initiating an io, the options must be set via ccw_device_set_options(). +* Instead of calling read_dev_chars()/read_conf_data(), the driver issues + the channel program and handles the interrupt itself. + +ccw_device_get_ciw() + get commands from extended sense data. 
+ +ccw_device_start(), ccw_device_start_timeout(), ccw_device_start_key(), ccw_device_start_key_timeout() + initiate an I/O request. + +ccw_device_resume() + resume channel program execution. + +ccw_device_halt() + terminate the current I/O request processed on the device. + +do_IRQ() + generic interrupt routine. This function is called by the interrupt entry + routine whenever an I/O interrupt is presented to the system. The do_IRQ() + routine determines the interrupt status and calls the device specific + interrupt handler according to the rules (flags) defined during I/O request + initiation with do_IO(). + +The next chapters describe the functions other than do_IRQ() in more details. +The do_IRQ() interface is not described, as it is called from the Linux/390 +first level interrupt handler only and does not comprise a device driver +callable interface. Instead, the functional description of do_IO() also +describes the input to the device specific interrupt handler. + +Note: + All explanations apply also to the 64 bit architecture s390x. + + +Common Device Support (CDS) for Linux/390 Device Drivers +======================================================== + +General Information +------------------- + +The following chapters describe the I/O related interface routines the +Linux/390 common device support (CDS) provides to allow for device specific +driver implementations on the IBM ESA/390 hardware platform. Those interfaces +intend to provide the functionality required by every device driver +implementation to allow to drive a specific hardware device on the ESA/390 +platform. Some of the interface routines are specific to Linux/390 and some +of them can be found on other Linux platforms implementations too. +Miscellaneous function prototypes, data declarations, and macro definitions +can be found in the architecture specific C header file +linux/arch/s390/include/asm/irq.h. 
+ +Overview of CDS interface concepts +---------------------------------- + +Different to other hardware platforms, the ESA/390 architecture doesn't define +interrupt lines managed by a specific interrupt controller and bus systems +that may or may not allow for shared interrupts, DMA processing, etc. Instead, +the ESA/390 architecture has implemented a so called channel subsystem, that +provides a unified view of the devices physically attached to the systems. +Though the ESA/390 hardware platform knows about a huge variety of different +peripheral attachments like disk devices (aka. DASDs), tapes, communication +controllers, etc. they can all be accessed by a well defined access method and +they present I/O completion in a unified way: I/O interruptions. Every +single device is uniquely identified to the system by a so called subchannel, +where the ESA/390 architecture allows for 64k devices to be attached. + +Linux, however, was first built on the Intel PC architecture, with its two +cascaded 8259 programmable interrupt controllers (PICs), that allow for a +maximum of 15 different interrupt lines. All devices attached to such a system +share those 15 interrupt levels. Devices attached to the ISA bus system must +not share interrupt levels (aka. IRQs), as the ISA bus is based on edge triggered +interrupts. MCA, EISA, PCI and other bus systems are based on level triggered +interrupts, and therefore allow for shared IRQs. However, if multiple devices +present their hardware status by the same (shared) IRQ, the operating system +has to call every single device driver registered on this IRQ in order to +determine the device driver owning the device that raised the interrupt. + +Up to kernel 2.4, Linux/390 used to provide interfaces via the IRQ (subchannel). +For internal use of the common I/O layer, these are still there. However, +device drivers should use the new calling interface via the ccw_device only. 
+ +During its startup the Linux/390 system checks for peripheral devices. Each +of those devices is uniquely defined by a so called subchannel by the ESA/390 +channel subsystem. While the subchannel numbers are system generated, each +subchannel also takes a user defined attribute, the so called device number. +Both subchannel number and device number cannot exceed 65535. During sysfs +initialisation, the information about control unit type and device types that +imply specific I/O commands (channel command words - CCWs) in order to operate +the device are gathered. Device drivers can retrieve this set of hardware +information during their initialization step to recognize the devices they +support using the information saved in the struct ccw_device given to them. +This method implies that Linux/390 doesn't require to probe for free (not +armed) interrupt request lines (IRQs) to drive its devices with. Where +applicable, the device drivers can issue the READ DEVICE CHARACTERISTICS +ccw to retrieve device characteristics in its online routine. + +In order to allow for easy I/O initiation the CDS layer provides a +ccw_device_start() interface that takes a device specific channel program (one +or more CCWs) as input, sets up the required architecture specific control blocks +and initiates an I/O request on behalf of the device driver. The +ccw_device_start() routine allows to specify whether it expects the CDS layer +to notify the device driver for every interrupt it observes, or with final status +only. See ccw_device_start() for more details. A device driver must never issue +ESA/390 I/O commands itself, but must use the Linux/390 CDS interfaces instead. + +For long running I/O requests to be canceled, the CDS layer provides the +ccw_device_halt() function. Some devices require to initially issue a HALT +SUBCHANNEL (HSCH) command without having pending I/O requests. This function is +also covered by ccw_device_halt(). 
+ + +get_ciw() - get command information word + +This call enables a device driver to get information about supported commands +from the extended SenseID data. + +:: + + struct ciw * + ccw_device_get_ciw(struct ccw_device *cdev, __u32 cmd); + +==== ======================================================== +cdev The ccw_device for which the command is to be retrieved. +cmd The command type to be retrieved. +==== ======================================================== + +ccw_device_get_ciw() returns: + +===== ================================================================ + NULL No extended data available, invalid device or command not found. +!NULL The command requested. +===== ================================================================ + +:: + + ccw_device_start() - Initiate I/O Request + +The ccw_device_start() routine is the I/O request front-end processor. All +device driver I/O requests must be issued using this routine. A device driver +must not issue ESA/390 I/O commands itself. Instead the ccw_device_start() +routine provides all interfaces required to drive arbitrary devices. + +This description also covers the status information passed to the device +driver's interrupt handler as this is related to the rules (flags) defined +with the associated I/O request when calling ccw_device_start(). 
+ +:: + + int ccw_device_start(struct ccw_device *cdev, + struct ccw1 *cpa, + unsigned long intparm, + __u8 lpm, + unsigned long flags); + int ccw_device_start_timeout(struct ccw_device *cdev, + struct ccw1 *cpa, + unsigned long intparm, + __u8 lpm, + unsigned long flags, + int expires); + int ccw_device_start_key(struct ccw_device *cdev, + struct ccw1 *cpa, + unsigned long intparm, + __u8 lpm, + __u8 key, + unsigned long flags); + int ccw_device_start_key_timeout(struct ccw_device *cdev, + struct ccw1 *cpa, + unsigned long intparm, + __u8 lpm, + __u8 key, + unsigned long flags, + int expires); + +============= ============================================================= +cdev ccw_device the I/O is destined for +cpa logical start address of channel program +user_intparm user specific interrupt information; will be presented + back to the device driver's interrupt handler. Allows a + device driver to associate the interrupt with a + particular I/O request. +lpm defines the channel path to be used for a specific I/O + request. A value of 0 will make cio use the opm. +key the storage key to use for the I/O (useful for operating on a + storage with a storage key != default key) +flag defines the action to be performed for I/O processing +expires timeout value in jiffies. The common I/O layer will terminate + the running program after this and call the interrupt handler + with ERR_PTR(-ETIMEDOUT) as irb. 
+============= ============================================================= + +Possible flag values are: + +========================= ============================================= +DOIO_ALLOW_SUSPEND channel program may become suspended +DOIO_DENY_PREFETCH don't allow for CCW prefetch; usually + this implies the channel program might + become modified +DOIO_SUPPRESS_INTER don't call the handler on intermediate status +========================= ============================================= + +The cpa parameter points to the first format 1 CCW of a channel program:: + + struct ccw1 { + __u8 cmd_code;/* command code */ + __u8 flags; /* flags, like IDA addressing, etc. */ + __u16 count; /* byte count */ + __u32 cda; /* data address */ + } __attribute__ ((packed,aligned(8))); + +with the following CCW flags values defined: + +=================== ========================= +CCW_FLAG_DC data chaining +CCW_FLAG_CC command chaining +CCW_FLAG_SLI suppress incorrect length +CCW_FLAG_SKIP skip +CCW_FLAG_PCI PCI +CCW_FLAG_IDA indirect addressing +CCW_FLAG_SUSPEND suspend +=================== ========================= + + +Via ccw_device_set_options(), the device driver may specify the following +options for the device: + +========================= ====================================== +DOIO_EARLY_NOTIFICATION allow for early interrupt notification +DOIO_REPORT_ALL report all interrupt conditions +========================= ====================================== + + +The ccw_device_start() function returns: + +======== ====================================================================== + 0 successful completion or request successfully initiated + -EBUSY The device is currently processing a previous I/O request, or there is + a status pending at the device. +-ENODEV cdev is invalid, the device is not operational or the ccw_device is + not online. 
+======== ====================================================================== + +When the I/O request completes, the CDS first level interrupt handler will +accumulate the status in a struct irb and then call the device interrupt handler. +The intparm field will contain the value the device driver has associated with a +particular I/O request. If a pending device status was recognized, +intparm will be set to 0 (zero). This may happen during I/O initiation or delayed +by an alert status notification. In any case this status is not related to the +current (last) I/O request. In case of a delayed status notification no special +interrupt will be presented to indicate I/O completion as the I/O request was +never started, even though ccw_device_start() returned with successful completion. + +The irb may contain an error value, and the device driver should check for this +first: + +========== ================================================================= +-ETIMEDOUT the common I/O layer terminated the request after the specified + timeout value +-EIO the common I/O layer terminated the request due to an error state +========== ================================================================= + +If the concurrent sense flag in the extended status word (esw) in the irb is +set, the field erw.scnt in the esw describes the number of device specific +sense bytes available in the extended control word irb->scsw.ecw[]. No device +sensing by the device driver itself is required. + +The device interrupt handler can use the following definitions to investigate +the primary unit check source coded in sense byte 0 : + +======================= ==== +SNS0_CMD_REJECT 0x80 +SNS0_INTERVENTION_REQ 0x40 +SNS0_BUS_OUT_CHECK 0x20 +SNS0_EQUIPMENT_CHECK 0x10 +SNS0_DATA_CHECK 0x08 +SNS0_OVERRUN 0x04 +SNS0_INCOMPL_DOMAIN 0x01 +======================= ==== + +Depending on the device status, multiple of those values may be set together. 
+Please refer to the device specific documentation for details. + +The irb->scsw.cstat field provides the (accumulated) subchannel status : + +========================= ============================ +SCHN_STAT_PCI program controlled interrupt +SCHN_STAT_INCORR_LEN incorrect length +SCHN_STAT_PROG_CHECK program check +SCHN_STAT_PROT_CHECK protection check +SCHN_STAT_CHN_DATA_CHK channel data check +SCHN_STAT_CHN_CTRL_CHK channel control check +SCHN_STAT_INTF_CTRL_CHK interface control check +SCHN_STAT_CHAIN_CHECK chaining check +========================= ============================ + +The irb->scsw.dstat field provides the (accumulated) device status : + +===================== ================= +DEV_STAT_ATTENTION attention +DEV_STAT_STAT_MOD status modifier +DEV_STAT_CU_END control unit end +DEV_STAT_BUSY busy +DEV_STAT_CHN_END channel end +DEV_STAT_DEV_END device end +DEV_STAT_UNIT_CHECK unit check +DEV_STAT_UNIT_EXCEP unit exception +===================== ================= + +Please see the ESA/390 Principles of Operation manual for details on the +individual flag meanings. + +Usage Notes: + +ccw_device_start() must be called disabled and with the ccw device lock held. + +The device driver is allowed to issue the next ccw_device_start() call from +within its interrupt handler already. It is not required to schedule a +bottom-half, unless a non deterministically long running error recovery procedure +or similar needs to be scheduled. During I/O processing the Linux/390 generic +I/O device driver support has already obtained the IRQ lock, i.e. the handler +must not try to obtain it again when calling ccw_device_start() or we end in a +deadlock situation! + +If a device driver relies on an I/O request to be completed prior to start the +next it can reduce I/O processing overhead by chaining a NoOp I/O command +CCW_CMD_NOOP to the end of the submitted CCW chain. This will force Channel-End +and Device-End status to be presented together, with a single interrupt. 
+However, this should be used with care as it implies the channel will remain +busy, not being able to process I/O requests for other devices on the same +channel. Therefore e.g. read commands should never use this technique, as the +result will be presented by a single interrupt anyway. + +In order to minimize I/O overhead, a device driver should use the +DOIO_REPORT_ALL only if the device can report intermediate interrupt +information prior to device-end the device driver urgently relies on. In this +case all I/O interruptions are presented to the device driver until final +status is recognized. + +If a device is able to recover from asynchronously presented I/O errors, it can +perform overlapping I/O using the DOIO_EARLY_NOTIFICATION flag. While some +devices always report channel-end and device-end together, with a single +interrupt, others present primary status (channel-end) when the channel is +ready for the next I/O request and secondary status (device-end) when the data +transmission has been completed at the device. + +Above flag allows to exploit this feature, e.g. for communication devices that +can handle lost data on the network to allow for enhanced I/O processing. + +Unless the channel subsystem at any time presents a secondary status interrupt, +exploiting this feature will cause only primary status interrupts to be +presented to the device driver while overlapping I/O is performed. When a +secondary status without error (alert status) is presented, this indicates +successful completion for all overlapping ccw_device_start() requests that have +been issued since the last secondary (final) status. + +Channel programs that intend to set the suspend flag on a channel command word +(CCW) must start the I/O operation with the DOIO_ALLOW_SUSPEND option or the +suspend flag will cause a channel program check. At the time the channel program +becomes suspended an intermediate interrupt will be generated by the channel +subsystem. 
+ +ccw_device_resume() - Resume Channel Program Execution + +If a device driver chooses to suspend the current channel program execution by +setting the CCW suspend flag on a particular CCW, the channel program execution +is suspended. In order to resume channel program execution the CIO layer +provides the ccw_device_resume() routine. + +:: + + int ccw_device_resume(struct ccw_device *cdev); + +==== ================================================ +cdev ccw_device the resume operation is requested for +==== ================================================ + +The ccw_device_resume() function returns: + +========= ============================================== + 0 suspended channel program is resumed + -EBUSY status pending + -ENODEV cdev invalid or not-operational subchannel + -EINVAL resume function not applicable +-ENOTCONN there is no I/O request pending for completion +========= ============================================== + +Usage Notes: + +Please have a look at the ccw_device_start() usage notes for more details on +suspended channel programs. + +ccw_device_halt() - Halt I/O Request Processing + +Sometimes a device driver might need a possibility to stop the processing of +a long-running channel program or the device might require to initially issue +a halt subchannel (HSCH) I/O command. For those purposes the ccw_device_halt() +command is provided. + +ccw_device_halt() must be called disabled and with the ccw device lock held. 
+ +:: + + int ccw_device_halt(struct ccw_device *cdev, + unsigned long intparm); + +======= ===================================================== +cdev ccw_device the halt operation is requested for +intparm interruption parameter; value is only used if no I/O + is outstanding, otherwise the intparm associated with + the I/O request is returned +======= ===================================================== + +The ccw_device_halt() function returns: + +======= ============================================================== + 0 request successfully initiated +-EBUSY the device is currently busy, or status pending. +-ENODEV cdev invalid. +-EINVAL The device is not operational or the ccw device is not online. +======= ============================================================== + +Usage Notes: + +A device driver may write a never-ending channel program by writing a channel +program that at its end loops back to its beginning by means of a transfer in +channel (TIC) command (CCW_CMD_TIC). Usually this is performed by network +device drivers by setting the PCI CCW flag (CCW_FLAG_PCI). Once this CCW is +executed a program controlled interrupt (PCI) is generated. The device driver +can then perform an appropriate action. Prior to interrupt of an outstanding +read to a network device (with or without PCI flag) a ccw_device_halt() +is required to end the pending operation. + +:: + + ccw_device_clear() - Terminate I/O Request Processing + +In order to terminate all I/O processing at the subchannel, the clear subchannel +(CSCH) command is used. It can be issued via ccw_device_clear(). + +ccw_device_clear() must be called disabled and with the ccw device lock held. 
+ +:: + + int ccw_device_clear(struct ccw_device *cdev, unsigned long intparm); + +======= =============================================== +cdev ccw_device the clear operation is requested for +intparm interruption parameter (see ccw_device_halt()) +======= =============================================== + +The ccw_device_clear() function returns: + +======= ============================================================== + 0 request successfully initiated +-ENODEV cdev invalid +-EINVAL The device is not operational or the ccw device is not online. +======= ============================================================== + +Miscellaneous Support Routines +------------------------------ + +This chapter describes various routines to be used in a Linux/390 device +driver programming environment. + +get_ccwdev_lock() + +Get the address of the device specific lock. This is then used in +spin_lock() / spin_unlock() calls. + +:: + + __u8 ccw_device_get_path_mask(struct ccw_device *cdev); + +Get the mask of the path currently available for cdev. diff --git a/Documentation/arch/s390/common_io.rst b/Documentation/arch/s390/common_io.rst new file mode 100644 index 000000000000..6dcb40cb7145 --- /dev/null +++ b/Documentation/arch/s390/common_io.rst @@ -0,0 +1,140 @@ +====================== +S/390 common I/O-Layer +====================== + +command line parameters, procfs and debugfs entries +=================================================== + +Command line parameters +----------------------- + +* ccw_timeout_log + + Enable logging of debug information in case of ccw device timeouts. + +* cio_ignore = device[,device[,..]] + + device := {all | [!]ipldev | [!]condev | [!] | [!]-} + + The given devices will be ignored by the common I/O-layer; no detection + and device sensing will be done on any of those devices. The subchannel to + which the device in question is attached will be treated as if no device was + attached. 
+ + An ignored device can be un-ignored later; see the "/proc entries"-section for + details. + + The devices must be given either as bus ids (0.x.abcd) or as hexadecimal + device numbers (0xabcd or abcd, for 2.4 backward compatibility). If you + give a device number 0xabcd, it will be interpreted as 0.0.abcd. + + You can use the 'all' keyword to ignore all devices. The 'ipldev' and 'condev' + keywords can be used to refer to the CCW based boot device and CCW console + device respectively (these are probably useful only when combined with the '!' + operator). The '!' operator will cause the I/O-layer to _not_ ignore a device. + The command line + is parsed from left to right. + + For example:: + + cio_ignore=0.0.0023-0.0.0042,0.0.4711 + + will ignore all devices ranging from 0.0.0023 to 0.0.0042 and the device + 0.0.4711, if detected. + + As another example:: + + cio_ignore=all,!0.0.4711,!0.0.fd00-0.0.fd02 + + will ignore all devices but 0.0.4711, 0.0.fd00, 0.0.fd01, 0.0.fd02. + + By default, no devices are ignored. + + +/proc entries +------------- + +* /proc/cio_ignore + + Lists the ranges of devices (by bus id) which are ignored by common I/O. + + You can un-ignore certain or all devices by piping to /proc/cio_ignore. + "free all" will un-ignore all ignored devices, + "free , , ..." will un-ignore the specified + devices. + + For example, if devices 0.0.0023 to 0.0.0042 and 0.0.4711 are ignored, + + - echo free 0.0.0030-0.0.0032 > /proc/cio_ignore + will un-ignore devices 0.0.0030 to 0.0.0032 and will leave devices 0.0.0023 + to 0.0.002f, 0.0.0033 to 0.0.0042 and 0.0.4711 ignored; + - echo free 0.0.0041 > /proc/cio_ignore will furthermore un-ignore device + 0.0.0041; + - echo free all > /proc/cio_ignore will un-ignore all remaining ignored + devices. + + When a device is un-ignored, device recognition and sensing is performed and + the device driver will be notified if possible, so the device will become + available to the system. 
Note that un-ignoring is performed asynchronously. + + You can also add ranges of devices to be ignored by piping to + /proc/cio_ignore; "add , , ..." will ignore the + specified devices. + + Note: While already known devices can be added to the list of devices to be + ignored, there will be no effect on them. However, if such a device + disappears and then reappears, it will then be ignored. To make + known devices go away, you need the "purge" command (see below). + + For example:: + + "echo add 0.0.a000-0.0.accc, 0.0.af00-0.0.afff > /proc/cio_ignore" + + will add 0.0.a000-0.0.accc and 0.0.af00-0.0.afff to the list of ignored + devices. + + You can remove already known but now ignored devices via:: + + "echo purge > /proc/cio_ignore" + + All devices ignored but still registered and not online (= not in use) + will be deregistered and thus removed from the system. + + The devices can be specified either by bus id (0.x.abcd) or, for 2.4 backward + compatibility, by the device number in hexadecimal (0xabcd or abcd). Device + numbers given as 0xabcd will be interpreted as 0.0.abcd. + +* /proc/cio_settle + + A write request to this file is blocked until all queued cio actions are + handled. This will allow userspace to wait for pending work affecting + device availability after changing cio_ignore or the hardware configuration. + +* For some of the information present in the /proc filesystem in 2.4 (namely, + /proc/subchannels and /proc/chpids), see driver-model.txt. + Information formerly in /proc/irq_count is now in /proc/interrupts. + + +debugfs entries +--------------- + +* /sys/kernel/debug/s390dbf/cio_*/ (S/390 debug feature) + + Some views generated by the debug feature to hold various debug outputs. + + - /sys/kernel/debug/s390dbf/cio_crw/sprintf + Messages from the processing of pending channel report words (machine check + handling). + + - /sys/kernel/debug/s390dbf/cio_msg/sprintf + Various debug messages from the common I/O-layer. 
+ + - /sys/kernel/debug/s390dbf/cio_trace/hex_ascii + Logs the calling of functions in the common I/O-layer and, if applicable, + which subchannel they were called for, as well as dumps of some data + structures (like irb in an error case). + + The level of logging can be changed to be more or less verbose by piping to + /sys/kernel/debug/s390dbf/cio_*/level a number between 0 and 6; see the + documentation on the S/390 debug feature (Documentation/arch/s390/s390dbf.rst) + for details. diff --git a/Documentation/arch/s390/config3270.sh b/Documentation/arch/s390/config3270.sh new file mode 100644 index 000000000000..515e2f431487 --- /dev/null +++ b/Documentation/arch/s390/config3270.sh @@ -0,0 +1,76 @@ +#!/bin/sh +# +# config3270 -- Autoconfigure /dev/3270/* and /etc/inittab +# +# Usage: +# config3270 +# +# Output: +# /tmp/mkdev3270 +# +# Operation: +# 1. Run this script +# 2. Run the script it produces: /tmp/mkdev3270 +# 3. Issue "telinit q" or reboot, as appropriate. +# +P=/proc/tty/driver/tty3270 +ROOT= +D=$ROOT/dev +SUBD=3270 +TTY=$SUBD/tty +TUB=$SUBD/tub +SCR=$ROOT/tmp/mkdev3270 +SCRTMP=$SCR.a +GETTYLINE=:2345:respawn:/sbin/mingetty +INITTAB=$ROOT/etc/inittab +NINITTAB=$ROOT/etc/NEWinittab +OINITTAB=$ROOT/etc/OLDinittab +ADDNOTE=\\"# Additional mingettys for the 3270/tty* driver, tub3270 ---\\" + +if ! ls $P > /dev/null 2>&1; then + modprobe tub3270 > /dev/null 2>&1 +fi +ls $P > /dev/null 2>&1 || exit 1 + +# Initialize two files, one for /dev/3270 commands and one +# to replace the /etc/inittab file (old one saved in OLDinittab) +echo "#!/bin/sh" > $SCR || exit 1 +echo " " >> $SCR +echo "# Script built by /sbin/config3270" >> $SCR +if [ ! -d /dev/dasd ]; then + echo rm -rf "$D/$SUBD/*" >> $SCR +fi +echo "grep -v $TTY $INITTAB > $NINITTAB" > $SCRTMP || exit 1 +echo "echo $ADDNOTE >> $NINITTAB" >> $SCRTMP +if [ ! 
-d /dev/dasd ]; then + echo mkdir -p $D/$SUBD >> $SCR +fi + +# Now query the tub3270 driver for 3270 device information +# and add appropriate mknod and mingetty lines to our files +echo what=config > $P +while read devno maj min;do + if [ $min = 0 ]; then + fsmaj=$maj + if [ ! -d /dev/dasd ]; then + echo mknod $D/$TUB c $fsmaj 0 >> $SCR + echo chmod 666 $D/$TUB >> $SCR + fi + elif [ $maj = CONSOLE ]; then + if [ ! -d /dev/dasd ]; then + echo mknod $D/$TUB$devno c $fsmaj $min >> $SCR + fi + else + if [ ! -d /dev/dasd ]; then + echo mknod $D/$TTY$devno c $maj $min >>$SCR + echo mknod $D/$TUB$devno c $fsmaj $min >> $SCR + fi + echo "echo t$min$GETTYLINE $TTY$devno >> $NINITTAB" >> $SCRTMP + fi +done < $P + +echo mv $INITTAB $OINITTAB >> $SCRTMP || exit 1 +echo mv $NINITTAB $INITTAB >> $SCRTMP +cat $SCRTMP >> $SCR +rm $SCRTMP +exit 0 diff --git a/Documentation/arch/s390/driver-model.rst b/Documentation/arch/s390/driver-model.rst new file mode 100644 index 000000000000..ad4bc2dbea43 --- /dev/null +++ b/Documentation/arch/s390/driver-model.rst @@ -0,0 +1,328 @@ +============================= +S/390 driver model interfaces +============================= + +1. CCW devices +-------------- + +All devices which can be addressed by means of ccws are called 'CCW devices' - +even if they aren't actually driven by ccws. + +All ccw devices are accessed via a subchannel, this is reflected in the +structures under devices/:: + + devices/ + - system/ + - css0/ + - 0.0.0000/0.0.0815/ + - 0.0.0001/0.0.4711/ + - 0.0.0002/ + - 0.1.0000/0.1.1234/ + ... + - defunct/ + +In this example, device 0815 is accessed via subchannel 0 in subchannel set 0, +device 4711 via subchannel 1 in subchannel set 0, and subchannel 2 is a non-I/O +subchannel. Device 1234 is accessed via subchannel 0 in subchannel set 1. 
+ +The subchannel named 'defunct' does not represent any real subchannel on the +system; it is a pseudo subchannel where disconnected ccw devices are moved to +if they are displaced by another ccw device becoming operational on their +former subchannel. The ccw devices will be moved again to a proper subchannel +if they become operational again on that subchannel. + +You should address a ccw device via its bus id (e.g. 0.0.4711); the device can +be found under bus/ccw/devices/. + +All ccw devices export some data via sysfs. + +cutype: + The control unit type / model. + +devtype: + The device type / model, if applicable. + +availability: + Can be 'good' or 'boxed'; 'no path' or 'no device' for + disconnected devices. + +online: + An interface to set the device online and offline. + In the special case of the device being disconnected (see the + notify function under 1.2), piping 0 to online will forcibly delete + the device. + +The device drivers can add entries to export per-device data and interfaces. + +There is also some data exported on a per-subchannel basis (see under +bus/css/devices/): + +chpids: + Via which chpids the device is connected. + +pimpampom: + The path installed, path available and path operational masks. + +There also might be additional data, for example for block devices. + + +1.1 Bringing up a ccw device +---------------------------- + +This is done in several steps. + +a. Each driver can provide one or more parameter interfaces where parameters can + be specified. These interfaces are also in the driver's responsibility. +b. After a. has been performed, if necessary, the device is finally brought up + via the 'online' interface. 
+ + +1.2 Writing a driver for ccw devices +------------------------------------ + +The basic struct ccw_device and struct ccw_driver data structures can be found +under include/asm/ccwdev.h:: + + struct ccw_device { + spinlock_t *ccwlock; + struct ccw_device_private *private; + struct ccw_device_id id; + + struct ccw_driver *drv; + struct device dev; + int online; + + void (*handler) (struct ccw_device *dev, unsigned long intparm, + struct irb *irb); + }; + + struct ccw_driver { + struct module *owner; + struct ccw_device_id *ids; + int (*probe) (struct ccw_device *); + int (*remove) (struct ccw_device *); + int (*set_online) (struct ccw_device *); + int (*set_offline) (struct ccw_device *); + int (*notify) (struct ccw_device *, int); + struct device_driver driver; + char *name; + }; + +The 'private' field contains data needed for internal i/o operation only, and +is not available to the device driver. + +Each driver should declare in a MODULE_DEVICE_TABLE into which CU types/models +and/or device types/models it is interested. This information can later be found +in the struct ccw_device_id fields:: + + struct ccw_device_id { + __u16 match_flags; + + __u16 cu_type; + __u16 dev_type; + __u8 cu_model; + __u8 dev_model; + + unsigned long driver_info; + }; + +The functions in ccw_driver should be used in the following way: + +probe: + This function is called by the device layer for each device the driver + is interested in. The driver should only allocate private structures + to put in dev->driver_data and create attributes (if needed). Also, + the interrupt handler (see below) should be set here. + +:: + + int (*probe) (struct ccw_device *cdev); + +Parameters: + cdev + - the device to be probed. + + +remove: + This function is called by the device layer upon removal of the driver, + the device or the module. The driver should perform cleanups here. + +:: + + int (*remove) (struct ccw_device *cdev); + +Parameters: + cdev + - the device to be removed. 
+ + +set_online: + This function is called by the common I/O layer when the device is + activated via the 'online' attribute. The driver should finally + setup and activate the device here. + +:: + + int (*set_online) (struct ccw_device *); + +Parameters: + cdev + - the device to be activated. The common layer has + verified that the device is not already online. + + +set_offline: This function is called by the common I/O layer when the device is + de-activated via the 'online' attribute. The driver should shut + down the device, but not de-allocate its private data. + +:: + + int (*set_offline) (struct ccw_device *); + +Parameters: + cdev + - the device to be deactivated. The common layer has + verified that the device is online. + + +notify: + This function is called by the common I/O layer for some state changes + of the device. + + Signalled to the driver are: + + * In online state, device detached (CIO_GONE) or last path gone + (CIO_NO_PATH). The driver must return !0 to keep the device; for + return code 0, the device will be deleted as usual (also when no + notify function is registered). If the driver wants to keep the + device, it is moved into disconnected state. + * In disconnected state, device operational again (CIO_OPER). The + common I/O layer performs some sanity checks on device number and + Device / CU to be reasonably sure if it is still the same device. + If not, the old device is removed and a new one registered. By the + return code of the notify function the device driver signals if it + wants the device back: !0 for keeping, 0 to make the device being + removed and re-registered. + +:: + + int (*notify) (struct ccw_device *, int); + +Parameters: + cdev + - the device whose state changed. + + event + - the event that happened. This can be one of CIO_GONE, + CIO_NO_PATH or CIO_OPER. + +The handler field of the struct ccw_device is meant to be set to the interrupt +handler for the device. 
In order to accommodate drivers which use several +distinct handlers (e.g. multi subchannel devices), this is a member of ccw_device +instead of ccw_driver. +The handler is registered with the common layer during set_online() processing +before the driver is called, and is deregistered during set_offline() after the +driver has been called. Also, after registering / before deregistering, path +grouping resp. disbanding of the path group (if applicable) are performed. + +:: + + void (*handler) (struct ccw_device *dev, unsigned long intparm, struct irb *irb); + +Parameters: dev - the device the handler is called for + intparm - the intparm which allows the device driver to identify + the i/o the interrupt is associated with, or to recognize + the interrupt as unsolicited. + irb - interruption response block which contains the accumulated + status. + +The device driver is called from the common ccw_device layer and can retrieve +information about the interrupt from the irb parameter. + + +1.3 ccwgroup devices +-------------------- + +The ccwgroup mechanism is designed to handle devices consisting of multiple ccw +devices, like lcs or ctc. + +The ccw driver provides a 'group' attribute. Piping bus ids of ccw devices to +this attribute creates a ccwgroup device consisting of these ccw devices (if +possible). This ccwgroup device can be set online or offline just like a normal +ccw device. + +Each ccwgroup device also provides an 'ungroup' attribute to destroy the device +again (only when offline). This is a generic ccwgroup mechanism (the driver does +not need to implement anything beyond normal removal routines). + +A ccw device which is a member of a ccwgroup device carries a pointer to the +ccwgroup device in the driver_data of its device struct. This field must not be +touched by the driver - it should use the ccwgroup device's driver_data for its +private data. + +To implement a ccwgroup driver, please refer to include/asm/ccwgroup.h.
Keep in +mind that most drivers will need to implement both a ccwgroup and a ccw +driver. + + +2. Channel paths +----------------- + +Channel paths show up, like subchannels, under the channel subsystem root (css0) +and are called 'chp0.'. They have no driver and do not belong to any bus. +Please note, that unlike /proc/chpids in 2.4, the channel path objects reflect +only the logical state and not the physical state, since we cannot track the +latter consistently due to lacking machine support (we don't need to be aware +of it anyway). + +status + - Can be 'online' or 'offline'. + Piping 'on' or 'off' sets the chpid logically online/offline. + Piping 'on' to an online chpid triggers path reprobing for all devices + the chpid connects to. This can be used to force the kernel to re-use + a channel path the user knows to be online, but the machine hasn't + created a machine check for. + +type + - The physical type of the channel path. + +shared + - Whether the channel path is shared. + +cmg + - The channel measurement group. + +3. System devices +----------------- + +3.1 xpram +--------- + +xpram shows up under devices/system/ as 'xpram'. + +3.2 cpus +-------- + +For each cpu, a directory is created under devices/system/cpu/. Each cpu has an +attribute 'online' which can be 0 or 1. + + +4. Other devices +---------------- + +4.1 Netiucv +----------- + +The netiucv driver creates an attribute 'connection' under +bus/iucv/drivers/netiucv. Piping to this attribute creates a new netiucv +connection to the specified host. + +Netiucv connections show up under devices/iucv/ as "netiucv". The interface +number is assigned sequentially to the connections defined via the 'connection' +attribute. + +user + - shows the connection partner. + +buffer + - maximum buffer size. Pipe to it to change buffer size. 
diff --git a/Documentation/arch/s390/features.rst b/Documentation/arch/s390/features.rst new file mode 100644 index 000000000000..57c296a9d8f3 --- /dev/null +++ b/Documentation/arch/s390/features.rst @@ -0,0 +1,3 @@ +.. SPDX-License-Identifier: GPL-2.0 + +.. kernel-feat:: $srctree/Documentation/features s390 diff --git a/Documentation/arch/s390/index.rst b/Documentation/arch/s390/index.rst new file mode 100644 index 000000000000..73c79bf586fd --- /dev/null +++ b/Documentation/arch/s390/index.rst @@ -0,0 +1,30 @@ +================= +s390 Architecture +================= + +.. toctree:: + :maxdepth: 1 + + cds + 3270 + driver-model + monreader + qeth + s390dbf + vfio-ap + vfio-ap-locking + vfio-ccw + zfcpdump + common_io + pci + + text_files + + features + +.. only:: subproject and html + + Indices + ======= + + * :ref:`genindex` diff --git a/Documentation/arch/s390/monreader.rst b/Documentation/arch/s390/monreader.rst new file mode 100644 index 000000000000..21cdfb699b49 --- /dev/null +++ b/Documentation/arch/s390/monreader.rst @@ -0,0 +1,212 @@ +================================================= +Linux API for read access to z/VM Monitor Records +================================================= + +Date : 2004-Nov-26 + +Author: Gerald Schaefer (geraldsc@de.ibm.com) + + + + +Description +=========== +This item delivers a new Linux API in the form of a misc char device that is +usable from user space and allows read access to the z/VM Monitor Records +collected by the `*MONITOR` System Service of z/VM. + + +User Requirements +================= +The z/VM guest on which you want to access this API needs to be configured in +order to allow IUCV connections to the `*MONITOR` service, i.e. it needs the +IUCV `*MONITOR` statement in its user entry. If the monitor DCSS to be used is +restricted (likely), you also need the NAMESAVE statement. +This item will use the IUCV device driver to access the z/VM services, so you +need a kernel with IUCV support. 
You also need z/VM version 4.4 or 5.1. + +There are two options for being able to load the monitor DCSS (examples assume +that the monitor DCSS begins at 144 MB and ends at 152 MB). You can query the +location of the monitor DCSS with the Class E privileged CP command Q NSS MAP +(the values BEGPAG and ENDPAG are given in units of 4K pages). + +See also "CP Command and Utility Reference" (SC24-6081-00) for more information +on the DEF STOR and Q NSS MAP commands, as well as "Saved Segments Planning +and Administration" (SC24-6116-00) for more information on DCSSes. + +1st option: +----------- +You can use the CP command DEF STOR CONFIG to define a "memory hole" in your +guest virtual storage around the address range of the DCSS. + +Example: DEF STOR CONFIG 0.140M 200M.200M + +This defines two blocks of storage, the first is 140MB in size and begins at +address 0MB, the second is 200MB in size and begins at address 200MB, +resulting in a total storage of 340MB. Note that the first block should +always start at 0 and be at least 64MB in size. + +2nd option: +----------- +Your guest virtual storage has to end below the starting address of the DCSS +and you have to specify the "mem=" kernel parameter in your parmfile with a +value greater than the ending address of the DCSS. + +Example:: + + DEF STOR 140M + +This defines 140MB storage size for your guest, the parameter "mem=160M" is +added to the parmfile. + + +User Interface +============== +The char device is implemented as a kernel module named "monreader", +which can be loaded via the modprobe command, or it can be compiled into the +kernel instead. There is one optional module (or kernel) parameter, "mondcss", +to specify the name of the monitor DCSS. If the module is compiled into the +kernel, the kernel parameter "monreader.mondcss=" can be specified +in the parmfile. + +The default name for the DCSS is "MONDCSS" if none is specified.
In case that +there are other users already connected to the `*MONITOR` service (e.g. +Performance Toolkit), the monitor DCSS is already defined and you have to use +the same DCSS. The CP command Q MONITOR (Class E privileged) shows the name +of the monitor DCSS, if already defined, and the users connected to the +`*MONITOR` service. +Refer to the "z/VM Performance" book (SC24-6109-00) on how to create a monitor +DCSS if your z/VM doesn't have one already, you need Class E privileges to +define and save a DCSS. + +Example: +-------- + +:: + + modprobe monreader mondcss=MYDCSS + +This loads the module and sets the DCSS name to "MYDCSS". + +NOTE: +----- +This API provides no interface to control the `*MONITOR` service, e.g. specify +which data should be collected. This can be done by the CP command MONITOR +(Class E privileged), see "CP Command and Utility Reference". + +Device nodes with udev: +----------------------- +After loading the module, a char device will be created along with the device +node //monreader. + +Device nodes without udev: +-------------------------- +If your distribution does not support udev, a device node will not be created +automatically and you have to create it manually after loading the module. +Therefore you need to know the major and minor numbers of the device. These +numbers can be found in /sys/class/misc/monreader/dev. + +Typing cat /sys/class/misc/monreader/dev will give an output of the form +:. The device node can be created via the mknod command, enter +mknod c , where is the name of the device node +to be created. + +Example: +-------- + +:: + + # modprobe monreader + # cat /sys/class/misc/monreader/dev + 10:63 + # mknod /dev/monreader c 10 63 + +This loads the module with the default monitor DCSS (MONDCSS) and creates a +device node. + +File operations: +---------------- +The following file operations are supported: open, release, read, poll. 
+There are two alternative methods for reading: either non-blocking read in +conjunction with polling, or blocking read without polling. IOCTLs are not +supported. + +Read: +----- +Reading from the device provides a 12 Byte monitor control element (MCE), +followed by a set of one or more contiguous monitor records (similar to the +output of the CMS utility MONWRITE without the 4K control blocks). The MCE +contains information on the type of the following record set (sample/event +data), the monitor domains contained within it and the start and end address +of the record set in the monitor DCSS. The start and end address can be used +to determine the size of the record set, the end address is the address of the +last byte of data. The start address is needed to handle "end-of-frame" records +correctly (domain 1, record 13), i.e. it can be used to determine the record +start offset relative to a 4K page (frame) boundary. + +See "Appendix A: `*MONITOR`" in the "z/VM Performance" document for a description +of the monitor control element layout. The layout of the monitor records can +be found here (z/VM 5.1): https://www.vm.ibm.com/pubs/mon510/index.html + +The layout of the data stream provided by the monreader device is as follows:: + + ... + <0 byte read> + \ + | + ... |- data set + | + / + <0 byte read> + ... + +There may be more than one combination of MCE and corresponding record set +within one data set and the end of each data set is indicated by a successful +read with a return value of 0 (0 byte read). +Any received data must be considered invalid until a complete set was +read successfully, including the closing 0 byte read. Therefore you should +always read the complete set into a buffer before processing the data. + +The maximum size of a data set can be as large as the size of the +monitor DCSS, so design the buffer adequately or use dynamic memory allocation. +The size of the monitor DCSS will be printed into syslog after loading the +module. 
You can also use the (Class E privileged) CP command Q NSS MAP to +list all available segments and information about them. + +As with most char devices, error conditions are indicated by returning a +negative value for the number of bytes read. In this case, the errno variable +indicates the error condition: + +EIO: + reply failed, read data is invalid and the application + should discard the data read since the last successful read with 0 size. +EFAULT: + copy_to_user failed, read data is invalid and the application should + discard the data read since the last successful read with 0 size. +EAGAIN: + occurs on a non-blocking read if there is no data available at the + moment. There is no data missing or corrupted, just try again or rather + use polling for non-blocking reads. +EOVERFLOW: + message limit reached, the data read since the last successful + read with 0 size is valid but subsequent records may be missing. + +In the last case (EOVERFLOW) there may be missing data, in the first two cases +(EIO, EFAULT) there will be missing data. It's up to the application if it will +continue reading subsequent data or rather exit. + +Open: +----- +Only one user is allowed to open the char device. If it is already in use, the +open function will fail (return a negative value) and set errno to EBUSY. +The open function may also fail if an IUCV connection to the `*MONITOR` service +cannot be established. In this case errno will be set to EIO and an error +message with an IPUSER SEVER code will be printed into syslog. The IPUSER SEVER +codes are described in the "z/VM Performance" book, Appendix A. + +NOTE: +----- +As soon as the device is opened, incoming messages will be accepted and they +will account for the message limit, i.e. opening the device without reading +from it will provoke the "message limit reached" error (EOVERFLOW error code) +eventually. 
diff --git a/Documentation/arch/s390/pci.rst b/Documentation/arch/s390/pci.rst new file mode 100644 index 000000000000..d5755484d8e7 --- /dev/null +++ b/Documentation/arch/s390/pci.rst @@ -0,0 +1,133 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========= +S/390 PCI +========= + +Authors: + - Pierre Morel + +Copyright, IBM Corp. 2020 + + +Command line parameters and debugfs entries +=========================================== + +Command line parameters +----------------------- + +* nomio + + Do not use PCI Mapped I/O (MIO) instructions. + +* norid + + Ignore the RID field and force use of one PCI domain per PCI function. + +debugfs entries +--------------- + +The S/390 debug feature (s390dbf) generates views to hold various debug results in sysfs directories of the form: + + * /sys/kernel/debug/s390dbf/pci_*/ + +For example: + + - /sys/kernel/debug/s390dbf/pci_msg/sprintf + Holds messages from the processing of PCI events, like machine check handling + and setting of global functionality, like UID checking. + + Change the level of logging to be more or less verbose by piping + a number between 0 and 6 to /sys/kernel/debug/s390dbf/pci_*/level. For + details, see the documentation on the S/390 debug feature at + Documentation/arch/s390/s390dbf.rst. + +Sysfs entries +============= + +Entries specific to zPCI functions and entries that hold zPCI information. + +* /sys/bus/pci/slots/XXXXXXXX + + The slot entries are set up using the function identifier (FID) of the + PCI function. The format depicted as XXXXXXXX above is 8 hexadecimal digits + with 0 padding and lower case hexadecimal digits. + + - /sys/bus/pci/slots/XXXXXXXX/power + + A physical function that currently supports a virtual function cannot be + powered off until all virtual functions are removed with: + echo 0 > /sys/bus/pci/devices/XXXX:XX:XX.X/sriov_numvf + +* /sys/bus/pci/devices/XXXX:XX:XX.X/ + + - function_id + A zPCI function identifier that uniquely identifies the function in the Z server. 
+ + - function_handle + Low-level identifier used for a configured PCI function. + It might be useful for debugging. + + - pchid + Model-dependent location of the I/O adapter. + + - pfgid + PCI function group ID, functions that share identical functionality + use a common identifier. + A PCI group defines interrupts, IOMMU, IOTLB, and DMA specifics. + + - vfn + The virtual function number, from 1 to N for virtual functions, + 0 for physical functions. + + - pft + The PCI function type + + - port + The port corresponds to the physical port the function is attached to. + It also gives an indication of the physical function a virtual function + is attached to. + + - uid + The user identifier (UID) may be defined as part of the machine + configuration or the z/VM or KVM guest configuration. If the accompanying + uid_is_unique attribute is 1 the platform guarantees that the UID is unique + within that instance and no devices with the same UID can be attached + during the lifetime of the system. + + - uid_is_unique + Indicates whether the user identifier (UID) is guaranteed to be and remain + unique within this Linux instance. + + - pfip/segmentX + The segments determine the isolation of a function. + They correspond to the physical path to the function. + The more the segments are different, the more the functions are isolated. + +Enumeration and hotplug +======================= + +The PCI address consists of four parts: domain, bus, device and function, +and is of this form: DDDD:BB:dd.f + +* When not using multi-functions (norid is set, or the firmware does not + support multi-functions): + + - There is only one function per domain. + + - The domain is set from the zPCI function's UID as defined during the + LPAR creation. + +* When using multi-functions (norid parameter is not set), + zPCI functions are addressed differently: + + - There is still only one bus per domain. + + - There can be up to 256 functions per bus. 
+ + - The domain part of the address of all functions for + a multi-Function device is set from the zPCI function's UID as defined + in the LPAR creation for the function zero. + + - New functions will only be ready for use after the function zero + (the function with devfn 0) has been enumerated. diff --git a/Documentation/arch/s390/qeth.rst b/Documentation/arch/s390/qeth.rst new file mode 100644 index 000000000000..f02fdaa68de0 --- /dev/null +++ b/Documentation/arch/s390/qeth.rst @@ -0,0 +1,64 @@ +============================= +IBM s390 QDIO Ethernet Driver +============================= + +OSA and HiperSockets Bridge Port Support +======================================== + +Uevents +------- + +To generate the events the device must be assigned a role of either +a primary or a secondary Bridge Port. For more information, see +"z/VM Connectivity, SC24-6174". + +When run on an OSA or HiperSockets Bridge Capable Port hardware, and the state +of some configured Bridge Port device on the channel changes, a udev +event with ACTION=CHANGE is emitted on behalf of the corresponding +ccwgroup device. The event has the following attributes: + +BRIDGEPORT=statechange + indicates that the Bridge Port device changed + its state. + +ROLE={primary|secondary|none} + the role assigned to the port. + +STATE={active|standby|inactive} + the newly assumed state of the port. + +When run on HiperSockets Bridge Capable Port hardware with host address +notifications enabled, a udev event with ACTION=CHANGE is emitted. +It is emitted on behalf of the corresponding ccwgroup device when a host +or a VLAN is registered or unregistered on the network served by the device. +The event has the following attributes: + +BRIDGEDHOST={reset|register|deregister|abort} + host address + notifications are started afresh, a new host or VLAN is registered or + deregistered on the Bridge Port HiperSockets channel, or address + notifications are aborted. 
+ +VLAN=numeric-vlan-id + VLAN ID on which the event occurred. Not included + if no VLAN is involved in the event. + +MAC=xx:xx:xx:xx:xx:xx + MAC address of the host that is being registered + or deregistered from the HiperSockets channel. Not reported if the + event reports the creation or destruction of a VLAN. + +NTOK_BUSID=x.y.zzzz + device bus ID (CSSID, SSID and device number). + +NTOK_IID=xx + device IID. + +NTOK_CHPID=xx + device CHPID. + +NTOK_CHID=xxxx + device channel ID. + +Note that the `NTOK_*` attributes refer to devices other than the one +connected to the system on which the OS is running. diff --git a/Documentation/arch/s390/s390dbf.rst b/Documentation/arch/s390/s390dbf.rst new file mode 100644 index 000000000000..af8bdc3629e7 --- /dev/null +++ b/Documentation/arch/s390/s390dbf.rst @@ -0,0 +1,478 @@ +================== +S390 Debug Feature +================== + +files: + - arch/s390/kernel/debug.c + - arch/s390/include/asm/debug.h + +Description: +------------ +The goal of this feature is to provide a kernel debug logging API +where log records can be stored efficiently in memory, where each component +(e.g. device drivers) can have one separate debug log. +One purpose of this is to inspect the debug logs after a production system crash +in order to analyze the reason for the crash. + +If the system still runs but only a subcomponent which uses dbf fails, +it is possible to look at the debug logs on a live system via the Linux +debugfs filesystem. + +The debug feature may also be very useful for kernel and driver development. + +Design: +------- +Kernel components (e.g. device drivers) can register themselves at the debug +feature with the function call :c:func:`debug_register()`. +This function initializes a +debug log for the caller. For each debug log exists a number of debug areas +where exactly one is active at one time. Each debug area consists of contiguous +pages in memory.
In the debug areas there are stored debug entries (log records) +which are written by event- and exception-calls. + +An event-call writes the specified debug entry to the active debug +area and updates the log pointer for the active area. If the end +of the active debug area is reached, a wrap around is done (ring buffer) +and the next debug entry will be written at the beginning of the active +debug area. + +An exception-call writes the specified debug entry to the log and +switches to the next debug area. This is done in order to be sure +that the records which describe the origin of the exception are not +overwritten when a wrap around for the current area occurs. + +The debug areas themselves are also ordered in form of a ring buffer. +When an exception is thrown in the last debug area, the following debug +entries are then written again in the very first area. + +There are four versions for the event- and exception-calls: One for +logging raw data, one for text, one for numbers (unsigned int and long), +and one for sprintf-like formatted strings. + +Each debug entry contains the following data: + +- Timestamp +- Cpu-Number of calling task +- Level of debug entry (0...6) +- Return Address to caller +- Flag, if entry is an exception or not + +The debug logs can be inspected in a live system through entries in +the debugfs-filesystem. Under the toplevel directory "``s390dbf``" there is +a directory for each registered component, which is named like the +corresponding component. The debugfs normally should be mounted to +``/sys/kernel/debug`` therefore the debug feature can be accessed under +``/sys/kernel/debug/s390dbf``. + +The content of the directories are files which represent different views +to the debug log. Each component can decide which views should be +used through registering them with the function :c:func:`debug_register_view()`. +Predefined views for hex/ascii and sprintf data are provided. +It is also possible to define other views. 
The content of +a view can be inspected simply by reading the corresponding debugfs file. + +All debug logs have an actual debug level (range from 0 to 6). +The default level is 3. Event and Exception functions have a :c:data:`level` +parameter. Only debug entries with a level that is lower than or equal +to the actual level are written to the log. This means, when +writing events, high priority log entries should have a low level +value whereas low priority entries should have a high one. +The actual debug level can be changed with the help of the debugfs-filesystem +through writing a number string "x" to the ``level`` debugfs file which is +provided for every debug log. Debugging can be switched off completely +by using "-" on the ``level`` debugfs file. + +Example:: + + > echo "-" > /sys/kernel/debug/s390dbf/dasd/level + +It is also possible to deactivate the debug feature globally for every +debug log. You can change the behavior using 2 sysctl parameters in +``/proc/sys/s390dbf``: + +There are currently 2 possible triggers, which stop the debug feature +globally. The first possibility is to use the ``debug_active`` sysctl. If +set to 1 the debug feature is running. If ``debug_active`` is set to 0 the +debug feature is turned off. + +The second trigger which stops the debug feature is a kernel oops. +That prevents the debug feature from overwriting debug information that +happened before the oops. After an oops you can reactivate the debug feature +by piping 1 to ``/proc/sys/s390dbf/debug_active``. Nevertheless, it's not +suggested to use an oopsed kernel in a production environment. + +If you want to disallow the deactivation of the debug feature, you can use +the ``debug_stoppable`` sysctl. If you set ``debug_stoppable`` to 0 the debug +feature cannot be stopped. If the debug feature is already stopped, it +will stay deactivated. + +Kernel Interfaces: +------------------ + +.. kernel-doc:: arch/s390/kernel/debug.c +..
kernel-doc:: arch/s390/include/asm/debug.h + +Predefined views: +----------------- + +.. code-block:: c + + extern struct debug_view debug_hex_ascii_view; + + extern struct debug_view debug_sprintf_view; + +Examples +-------- + +.. code-block:: c + + /* + * hex_ascii-view Example + */ + + #include + #include + + static debug_info_t *debug_info; + + static int init(void) + { + /* register 4 debug areas with one page each and 4 byte data field */ + + debug_info = debug_register("test", 1, 4, 4 ); + debug_register_view(debug_info, &debug_hex_ascii_view); + + debug_text_event(debug_info, 4 , "one "); + debug_int_exception(debug_info, 4, 4711); + debug_event(debug_info, 3, &debug_info, 4); + + return 0; + } + + static void cleanup(void) + { + debug_unregister(debug_info); + } + + module_init(init); + module_exit(cleanup); + +.. code-block:: c + + /* + * sprintf-view Example + */ + + #include + #include + + static debug_info_t *debug_info; + + static int init(void) + { + /* register 4 debug areas with one page each and data field for */ + /* format string pointer + 2 varargs (= 3 * sizeof(long)) */ + + debug_info = debug_register("test", 1, 4, sizeof(long) * 3); + debug_register_view(debug_info, &debug_sprintf_view); + + debug_sprintf_event(debug_info, 2 , "first event in %s:%i\n",__FILE__,__LINE__); + debug_sprintf_exception(debug_info, 1, "pointer to debug info: %p\n",&debug_info); + + return 0; + } + + static void cleanup(void) + { + debug_unregister(debug_info); + } + + module_init(init); + module_exit(cleanup); + +Debugfs Interface +----------------- +Views to the debug logs can be investigated through reading the corresponding +debugfs-files: + +Example:: + + > ls /sys/kernel/debug/s390dbf/dasd + flush hex_ascii level pages + > cat /sys/kernel/debug/s390dbf/dasd/hex_ascii | sort -k2,2 -s + 00 00974733272:680099 2 - 02 0006ad7e 07 ea 4a 90 | .... + 00 00974733272:682210 2 - 02 0006ade6 46 52 45 45 | FREE + 00 00974733272:682213 2 - 02 0006adf6 07 ea 4a 90 | .... 
+ 00 00974733272:682281 1 * 02 0006ab08 41 4c 4c 43 | EXCP + 01 00974733272:682284 2 - 02 0006ab16 45 43 4b 44 | ECKD + 01 00974733272:682287 2 - 02 0006ab28 00 00 00 04 | .... + 01 00974733272:682289 2 - 02 0006ab3e 00 00 00 20 | ... + 01 00974733272:682297 2 - 02 0006ad7e 07 ea 4a 90 | .... + 01 00974733272:684384 2 - 00 0006ade6 46 52 45 45 | FREE + 01 00974733272:684388 2 - 00 0006adf6 07 ea 4a 90 | .... + +See section about predefined views for explanation of the above output! + +Changing the debug level +------------------------ + +Example:: + + + > cat /sys/kernel/debug/s390dbf/dasd/level + 3 + > echo "5" > /sys/kernel/debug/s390dbf/dasd/level + > cat /sys/kernel/debug/s390dbf/dasd/level + 5 + +Flushing debug areas +-------------------- +Debug areas can be flushed with piping the number of the desired +area (0...n) to the debugfs file "flush". When using "-" all debug areas +are flushed. + +Examples: + +1. Flush debug area 0:: + + > echo "0" > /sys/kernel/debug/s390dbf/dasd/flush + +2. Flush all debug areas:: + + > echo "-" > /sys/kernel/debug/s390dbf/dasd/flush + +Changing the size of debug areas +------------------------------------ +It is possible to change the size of debug areas through piping +the number of pages to the debugfs file "pages". The resize request will +also flush the debug areas. + +Example: + +Define 4 pages for the debug areas of debug feature "dasd":: + + > echo "4" > /sys/kernel/debug/s390dbf/dasd/pages + +Stopping the debug feature +-------------------------- +Example: + +1. Check if stopping is allowed:: + + > cat /proc/sys/s390dbf/debug_stoppable + +2. Stop debug feature:: + + > echo 0 > /proc/sys/s390dbf/debug_active + +crash Interface +---------------- +The ``crash`` tool since v5.1.0 has a built-in command +``s390dbf`` to display all the debug logs or export them to the file system. +With this tool it is possible +to investigate the debug logs on a live system and with a memory dump after +a system crash.
+ +Investigating raw memory +------------------------ +One last possibility to investigate the debug logs at a live +system and after a system crash is to look at the raw memory +under VM or at the Service Element. +It is possible to find the anchor of the debug-logs through +the ``debug_area_first`` symbol in the System map. Then one has +to follow the correct pointers of the data-structures defined +in debug.h and find the debug-areas in memory. +Normally modules which use the debug feature will also have +a global variable with the pointer to the debug-logs. Following +this pointer it will also be possible to find the debug logs in +memory. + +For this method it is recommended to use '16 * x + 4' byte (x = 0..n) +for the length of the data field in :c:func:`debug_register()` in +order to see the debug entries well formatted. + + +Predefined Views +---------------- + +There are two predefined views: hex_ascii and sprintf. +The hex_ascii view shows the data field in hex and ascii representation +(e.g. ``45 43 4b 44 | ECKD``). + +The sprintf view formats the debug entries in the same way as the sprintf +function would do. The sprintf event/exception functions write to the +debug entry a pointer to the format string (size = sizeof(long)) +and for each vararg a long value. So e.g. for a debug entry with a format +string plus two varargs one would need to allocate a (3 * sizeof(long)) +byte data area in the debug_register() function. + +IMPORTANT: + Using "%s" in sprintf event functions is dangerous. You can only + use "%s" in the sprintf event functions, if the memory for the passed string + is available as long as the debug feature exists. The reason behind this is + that due to performance considerations only a pointer to the string is stored + in the debug feature. If you log a string that is freed afterwards, you will + get an OOPS when inspecting the debug feature, because then the debug feature + will access the already freed memory. 
+ +NOTE: + If using the sprintf view do NOT use other event/exception functions + than the sprintf-event and -exception functions. + +The format of the hex_ascii and sprintf view is as follows: + +- Number of area +- Timestamp (formatted as seconds and microseconds since 00:00:00 Coordinated + Universal Time (UTC), January 1, 1970) +- level of debug entry +- Exception flag (* = Exception) +- Cpu-Number of calling task +- Return Address to caller +- data field + +A typical line of the hex_ascii view will look like the following (first line +is only for explanation and will not be displayed when 'cating' the view):: + + area time level exception cpu caller data (hex + ascii) + -------------------------------------------------------------------------- + 00 00964419409:440690 1 - 00 88023fe + + +Defining views +-------------- + +Views are specified with the 'debug_view' structure. There are defined +callback functions which are used for reading and writing the debugfs files: + +.. code-block:: c + + struct debug_view { + char name[DEBUG_MAX_PROCF_LEN]; + debug_prolog_proc_t* prolog_proc; + debug_header_proc_t* header_proc; + debug_format_proc_t* format_proc; + debug_input_proc_t* input_proc; + void* private_data; + }; + +where: + +.. code-block:: c + + typedef int (debug_header_proc_t) (debug_info_t* id, + struct debug_view* view, + int area, + debug_entry_t* entry, + char* out_buf); + + typedef int (debug_format_proc_t) (debug_info_t* id, + struct debug_view* view, char* out_buf, + const char* in_buf); + typedef int (debug_prolog_proc_t) (debug_info_t* id, + struct debug_view* view, + char* out_buf); + typedef int (debug_input_proc_t) (debug_info_t* id, + struct debug_view* view, + struct file* file, const char* user_buf, + size_t in_buf_size, loff_t* offset); + + +The "private_data" member can be used as pointer to view specific data. +It is not used by the debug feature itself. 
+ +The output when reading a debugfs file is structured like this:: + + "prolog_proc output" + + "header_proc output 1" "format_proc output 1" + "header_proc output 2" "format_proc output 2" + "header_proc output 3" "format_proc output 3" + ... + +When a view is read from the debugfs, the Debug Feature calls the +'prolog_proc' once for writing the prolog. +Then 'header_proc' and 'format_proc' are called for each +existing debug entry. + +The input_proc can be used to implement functionality when it is written to +the view (e.g. like with ``echo "0" > /sys/kernel/debug/s390dbf/dasd/level``). + +For header_proc there can be used the default function +:c:func:`debug_dflt_header_fn()` which is defined in debug.h. +and which produces the same header output as the predefined views. +E.g:: + + 00 00964419409:440761 2 - 00 88023ec + +In order to see how to use the callback functions check the implementation +of the default views! + +Example: + +.. code-block:: c + + #include + + #define UNKNOWNSTR "data: %08x" + + const char* messages[] = + {"This error...........\n", + "That error...........\n", + "Problem..............\n", + "Something went wrong.\n", + "Everything ok........\n", + NULL + }; + + static int debug_test_format_fn( + debug_info_t *id, struct debug_view *view, + char *out_buf, const char *in_buf + ) + { + int i, rc = 0; + + if (id->buf_size >= 4) { + int msg_nr = *((int*)in_buf); + if (msg_nr < sizeof(messages) / sizeof(char*) - 1) + rc += sprintf(out_buf, "%s", messages[msg_nr]); + else + rc += sprintf(out_buf, UNKNOWNSTR, msg_nr); + } + return rc; + } + + struct debug_view debug_test_view = { + "myview", /* name of view */ + NULL, /* no prolog */ + &debug_dflt_header_fn, /* default header for each entry */ + &debug_test_format_fn, /* our own format function */ + NULL, /* no input function */ + NULL /* no private data */ + }; + +test: +===== + +.. code-block:: c + + debug_info_t *debug_info; + int i; + ... 
+ debug_info = debug_register("test", 0, 4, 4); + debug_register_view(debug_info, &debug_test_view); + for (i = 0; i < 10; i ++) + debug_int_event(debug_info, 1, i); + +:: + + > cat /sys/kernel/debug/s390dbf/test/myview + 00 00964419734:611402 1 - 00 88042ca This error........... + 00 00964419734:611405 1 - 00 88042ca That error........... + 00 00964419734:611408 1 - 00 88042ca Problem.............. + 00 00964419734:611411 1 - 00 88042ca Something went wrong. + 00 00964419734:611414 1 - 00 88042ca Everything ok........ + 00 00964419734:611417 1 - 00 88042ca data: 00000005 + 00 00964419734:611419 1 - 00 88042ca data: 00000006 + 00 00964419734:611422 1 - 00 88042ca data: 00000007 + 00 00964419734:611425 1 - 00 88042ca data: 00000008 + 00 00964419734:611428 1 - 00 88042ca data: 00000009 diff --git a/Documentation/arch/s390/text_files.rst b/Documentation/arch/s390/text_files.rst new file mode 100644 index 000000000000..c94d05d4fa17 --- /dev/null +++ b/Documentation/arch/s390/text_files.rst @@ -0,0 +1,11 @@ +ibm 3270 changelog +------------------ + +.. include:: 3270.ChangeLog + :literal: + +ibm 3270 config3270.sh +---------------------- + +.. literalinclude:: config3270.sh + :language: shell diff --git a/Documentation/arch/s390/vfio-ap-locking.rst b/Documentation/arch/s390/vfio-ap-locking.rst new file mode 100644 index 000000000000..0dfcdb562e21 --- /dev/null +++ b/Documentation/arch/s390/vfio-ap-locking.rst @@ -0,0 +1,115 @@ +.. SPDX-License-Identifier: GPL-2.0 + +====================== +VFIO AP Locks Overview +====================== +This document describes the locks that are pertinent to the secure operation +of the vfio_ap device driver. Throughout this document, the following variables +will be used to denote instances of the structures herein described: + +.. 
code-block:: c + + struct ap_matrix_dev *matrix_dev; + struct ap_matrix_mdev *matrix_mdev; + struct kvm *kvm; + +The Matrix Devices Lock (drivers/s390/crypto/vfio_ap_private.h) +--------------------------------------------------------------- + +.. code-block:: c + + struct ap_matrix_dev { + ... + struct list_head mdev_list; + struct mutex mdevs_lock; + ... + } + +The Matrix Devices Lock (matrix_dev->mdevs_lock) is implemented as a global +mutex contained within the single object of struct ap_matrix_dev. This lock +controls access to all fields contained within each matrix_mdev +(matrix_dev->mdev_list). This lock must be held while reading from, writing to +or using the data from a field contained within a matrix_mdev instance +representing one of the vfio_ap device driver's mediated devices. + +The KVM Lock (include/linux/kvm_host.h) +--------------------------------------- + +.. code-block:: c + + struct kvm { + ... + struct mutex lock; + ... + } + +The KVM Lock (kvm->lock) controls access to the state data for a KVM guest. This +lock must be held by the vfio_ap device driver while one or more AP adapters, +domains or control domains are being plugged into or unplugged from the guest. + +The KVM pointer is stored in the in the matrix_mdev instance +(matrix_mdev->kvm = kvm) containing the state of the mediated device that has +been attached to the KVM guest. + +The Guests Lock (drivers/s390/crypto/vfio_ap_private.h) +----------------------------------------------------------- + +.. code-block:: c + + struct ap_matrix_dev { + ... + struct list_head mdev_list; + struct mutex guests_lock; + ... + } + +The Guests Lock (matrix_dev->guests_lock) controls access to the +matrix_mdev instances (matrix_dev->mdev_list) that represent mediated devices +that hold the state for the mediated devices that have been attached to a +KVM guest. This lock must be held: + +1. 
To control access to the KVM pointer (matrix_mdev->kvm) while the vfio_ap + device driver is using it to plug/unplug AP devices passed through to the KVM + guest. + +2. To add matrix_mdev instances to or remove them from matrix_dev->mdev_list. + This is necessary to ensure the proper locking order when the list is perused + to find an ap_matrix_mdev instance for the purpose of plugging/unplugging + AP devices passed through to a KVM guest. + + For example, when a queue device is removed from the vfio_ap device driver, + if the adapter is passed through to a KVM guest, it will have to be + unplugged. In order to figure out whether the adapter is passed through, + the matrix_mdev object to which the queue is assigned will have to be + found. The KVM pointer (matrix_mdev->kvm) can then be used to determine if + the mediated device is passed through (matrix_mdev->kvm != NULL) and if so, + to unplug the adapter. + +It is not necessary to take the Guests Lock to access the KVM pointer if the +pointer is not used to plug/unplug devices passed through to the KVM guest; +however, in this case, the Matrix Devices Lock (matrix_dev->mdevs_lock) must be +held in order to access the KVM pointer since it is set and cleared under the +protection of the Matrix Devices Lock. A case in point is the function that +handles interception of the PQAP(AQIC) instruction sub-function. This handler +needs to access the KVM pointer only for the purposes of setting or clearing IRQ +resources, so only the matrix_dev->mdevs_lock needs to be held. + +The PQAP Hook Lock (arch/s390/include/asm/kvm_host.h) +----------------------------------------------------- + +.. code-block:: c + + typedef int (*crypto_hook)(struct kvm_vcpu *vcpu); + + struct kvm_s390_crypto { + ... + struct rw_semaphore pqap_hook_rwsem; + crypto_hook *pqap_hook; + ... 
+ }; + +The PQAP Hook Lock is a r/w semaphore that controls access to the function +pointer of the handler ``(*kvm->arch.crypto.pqap_hook)`` to invoke when the +PQAP(AQIC) instruction sub-function is intercepted by the host. The lock must be +held in write mode when pqap_hook value is set, and in read mode when the +pqap_hook function is called. diff --git a/Documentation/arch/s390/vfio-ap.rst b/Documentation/arch/s390/vfio-ap.rst new file mode 100644 index 000000000000..bb3f4c4e2885 --- /dev/null +++ b/Documentation/arch/s390/vfio-ap.rst @@ -0,0 +1,1069 @@ +=============================== +Adjunct Processor (AP) facility +=============================== + + +Introduction +============ +The Adjunct Processor (AP) facility is an IBM Z cryptographic facility comprised +of three AP instructions and from 1 up to 256 PCIe cryptographic adapter cards. +The AP devices provide cryptographic functions to all CPUs assigned to a +linux system running in an IBM Z system LPAR. + +The AP adapter cards are exposed via the AP bus. The motivation for vfio-ap +is to make AP cards available to KVM guests using the VFIO mediated device +framework. This implementation relies considerably on the s390 virtualization +facilities which do most of the hard work of providing direct access to AP +devices. + +AP Architectural Overview +========================= +To facilitate the comprehension of the design, let's start with some +definitions: + +* AP adapter + + An AP adapter is an IBM Z adapter card that can perform cryptographic + functions. There can be from 0 to 256 adapters assigned to an LPAR. Adapters + assigned to the LPAR in which a linux host is running will be available to + the linux host. Each adapter is identified by a number from 0 to 255; however, + the maximum adapter number is determined by machine model and/or adapter type. + When installed, an AP adapter is accessed by AP instructions executed by any + CPU. 
+ + The AP adapter cards are assigned to a given LPAR via the system's Activation + Profile which can be edited via the HMC. When the linux host system is IPL'd + in the LPAR, the AP bus detects the AP adapter cards assigned to the LPAR and + creates a sysfs device for each assigned adapter. For example, if AP adapters + 4 and 10 (0x0a) are assigned to the LPAR, the AP bus will create the following + sysfs device entries:: + + /sys/devices/ap/card04 + /sys/devices/ap/card0a + + Symbolic links to these devices will also be created in the AP bus devices + sub-directory:: + + /sys/bus/ap/devices/[card04] + /sys/bus/ap/devices/[card04] + +* AP domain + + An adapter is partitioned into domains. An adapter can hold up to 256 domains + depending upon the adapter type and hardware configuration. A domain is + identified by a number from 0 to 255; however, the maximum domain number is + determined by machine model and/or adapter type.. A domain can be thought of + as a set of hardware registers and memory used for processing AP commands. A + domain can be configured with a secure private key used for clear key + encryption. A domain is classified in one of two ways depending upon how it + may be accessed: + + * Usage domains are domains that are targeted by an AP instruction to + process an AP command. + + * Control domains are domains that are changed by an AP command sent to a + usage domain; for example, to set the secure private key for the control + domain. + + The AP usage and control domains are assigned to a given LPAR via the system's + Activation Profile which can be edited via the HMC. When a linux host system + is IPL'd in the LPAR, the AP bus module detects the AP usage and control + domains assigned to the LPAR. The domain number of each usage domain and + adapter number of each AP adapter are combined to create AP queue devices + (see AP Queue section below). 
The domain number of each control domain will be + represented in a bitmask and stored in a sysfs file + /sys/bus/ap/ap_control_domain_mask. The bits in the mask, from most to least + significant bit, correspond to domains 0-255. + +* AP Queue + + An AP queue is the means by which an AP command is sent to a usage domain + inside a specific adapter. An AP queue is identified by a tuple + comprised of an AP adapter ID (APID) and an AP queue index (APQI). The + APQI corresponds to a given usage domain number within the adapter. This tuple + forms an AP Queue Number (APQN) uniquely identifying an AP queue. AP + instructions include a field containing the APQN to identify the AP queue to + which the AP command is to be sent for processing. + + The AP bus will create a sysfs device for each APQN that can be derived from + the cross product of the AP adapter and usage domain numbers detected when the + AP bus module is loaded. For example, if adapters 4 and 10 (0x0a) and usage + domains 6 and 71 (0x47) are assigned to the LPAR, the AP bus will create the + following sysfs entries:: + + /sys/devices/ap/card04/04.0006 + /sys/devices/ap/card04/04.0047 + /sys/devices/ap/card0a/0a.0006 + /sys/devices/ap/card0a/0a.0047 + + The following symbolic links to these devices will be created in the AP bus + devices subdirectory:: + + /sys/bus/ap/devices/[04.0006] + /sys/bus/ap/devices/[04.0047] + /sys/bus/ap/devices/[0a.0006] + /sys/bus/ap/devices/[0a.0047] + +* AP Instructions: + + There are three AP instructions: + + * NQAP: to enqueue an AP command-request message to a queue + * DQAP: to dequeue an AP command-reply message from a queue + * PQAP: to administer the queues + + AP instructions identify the domain that is targeted to process the AP + command; this must be one of the usage domains. An AP command may modify a + domain that is not one of the usage domains, but the modified domain + must be one of the control domains. 
+ +AP and SIE +========== +Let's now take a look at how AP instructions executed on a guest are interpreted +by the hardware. + +A satellite control block called the Crypto Control Block (CRYCB) is attached to +our main hardware virtualization control block. The CRYCB contains an AP Control +Block (APCB) that has three fields to identify the adapters, usage domains and +control domains assigned to the KVM guest: + +* The AP Mask (APM) field is a bit mask that identifies the AP adapters assigned + to the KVM guest. Each bit in the mask, from left to right, corresponds to + an APID from 0-255. If a bit is set, the corresponding adapter is valid for + use by the KVM guest. + +* The AP Queue Mask (AQM) field is a bit mask identifying the AP usage domains + assigned to the KVM guest. Each bit in the mask, from left to right, + corresponds to an AP queue index (APQI) from 0-255. If a bit is set, the + corresponding queue is valid for use by the KVM guest. + +* The AP Domain Mask field is a bit mask that identifies the AP control domains + assigned to the KVM guest. The ADM bit mask controls which domains can be + changed by an AP command-request message sent to a usage domain from the + guest. Each bit in the mask, from left to right, corresponds to a domain from + 0-255. If a bit is set, the corresponding domain can be modified by an AP + command-request message sent to a usage domain. + +If you recall from the description of an AP Queue, AP instructions include +an APQN to identify the AP queue to which an AP command-request message is to be +sent (NQAP and PQAP instructions), or from which a command-reply message is to +be received (DQAP instruction). The validity of an APQN is defined by the matrix +calculated from the APM and AQM; it is the Cartesian product of all assigned +adapter numbers (APM) with all assigned queue indexes (AQM). 
For example, if +adapters 1 and 2 and usage domains 5 and 6 are assigned to a guest, the APQNs +(1,5), (1,6), (2,5) and (2,6) will be valid for the guest. + +The APQNs can provide secure key functionality - i.e., a private key is stored +on the adapter card for each of its domains - so each APQN must be assigned to +at most one guest or to the linux host:: + + Example 1: Valid configuration: + ------------------------------ + Guest1: adapters 1,2 domains 5,6 + Guest2: adapter 1,2 domain 7 + + This is valid because both guests have a unique set of APQNs: + Guest1 has APQNs (1,5), (1,6), (2,5), (2,6); + Guest2 has APQNs (1,7), (2,7) + + Example 2: Valid configuration: + ------------------------------ + Guest1: adapters 1,2 domains 5,6 + Guest2: adapters 3,4 domains 5,6 + + This is also valid because both guests have a unique set of APQNs: + Guest1 has APQNs (1,5), (1,6), (2,5), (2,6); + Guest2 has APQNs (3,5), (3,6), (4,5), (4,6) + + Example 3: Invalid configuration: + -------------------------------- + Guest1: adapters 1,2 domains 5,6 + Guest2: adapter 1 domains 6,7 + + This is an invalid configuration because both guests have access to + APQN (1,6). + +The Design +========== +The design introduces three new objects: + +1. AP matrix device +2. VFIO AP device driver (vfio_ap.ko) +3. VFIO AP mediated pass-through device + +The VFIO AP device driver +------------------------- +The VFIO AP (vfio_ap) device driver serves the following purposes: + +1. Provides the interfaces to secure APQNs for exclusive use of KVM guests. + +2. Sets up the VFIO mediated device interfaces to manage a vfio_ap mediated + device and creates the sysfs interfaces for assigning adapters, usage + domains, and control domains comprising the matrix for a KVM guest. + +3. 
Configures the APM, AQM and ADM in the APCB contained in the CRYCB referenced + by a KVM guest's SIE state description to grant the guest access to a matrix + of AP devices + +Reserve APQNs for exclusive use of KVM guests +--------------------------------------------- +The following block diagram illustrates the mechanism by which APQNs are +reserved:: + + +------------------+ + 7 remove | | + +--------------------> cex4queue driver | + | | | + | +------------------+ + | + | + | +------------------+ +----------------+ + | 5 register driver | | 3 create | | + | +----------------> Device core +----------> matrix device | + | | | | | | + | | +--------^---------+ +----------------+ + | | | + | | +-------------------+ + | | +-----------------------------------+ | + | | | 4 register AP driver | | 2 register device + | | | | | + +--------+---+-v---+ +--------+-------+-+ + | | | | + | ap_bus +--------------------- > vfio_ap driver | + | | 8 probe | | + +--------^---------+ +--^--^------------+ + 6 edit | | | + apmask | +-----------------------------+ | 11 mdev create + aqmask | | 1 modprobe | + +--------+-----+---+ +----------------+-+ +----------------+ + | | | |10 create| mediated | + | admin | | VFIO device core |---------> matrix | + | + | | | device | + +------+-+---------+ +--------^---------+ +--------^-------+ + | | | | + | | 9 create vfio_ap-passthrough | | + | +------------------------------+ | + +-------------------------------------------------------------+ + 12 assign adapter/domain/control domain + +The process for reserving an AP queue for use by a KVM guest is: + +1. The administrator loads the vfio_ap device driver +2. The vfio-ap driver during its initialization will register a single 'matrix' + device with the device core. This will serve as the parent device for + all vfio_ap mediated devices used to configure an AP matrix for a guest. +3. The /sys/devices/vfio_ap/matrix device is created by the device core +4. 
The vfio_ap device driver will register with the AP bus for AP queue devices + of type 10 and higher (CEX4 and newer). The driver will provide the vfio_ap + driver's probe and remove callback interfaces. Devices older than CEX4 queues + are not supported to simplify the implementation by not needlessly + complicating the design by supporting older devices that will go out of + service in the relatively near future, and for which there are few older + systems around on which to test. +5. The AP bus registers the vfio_ap device driver with the device core +6. The administrator edits the AP adapter and queue masks to reserve AP queues + for use by the vfio_ap device driver. +7. The AP bus removes the AP queues reserved for the vfio_ap driver from the + default zcrypt cex4queue driver. +8. The AP bus probes the vfio_ap device driver to bind the queues reserved for + it. +9. The administrator creates a passthrough type vfio_ap mediated device to be + used by a guest +10. The administrator assigns the adapters, usage domains and control domains + to be exclusively used by a guest. + +Set up the VFIO mediated device interfaces +------------------------------------------ +The VFIO AP device driver utilizes the common interfaces of the VFIO mediated +device core driver to: + +* Register an AP mediated bus driver to add a vfio_ap mediated device to and + remove it from a VFIO group. 
+* Create and destroy a vfio_ap mediated device +* Add a vfio_ap mediated device to and remove it from the AP mediated bus driver +* Add a vfio_ap mediated device to and remove it from an IOMMU group + +The following high-level block diagram shows the main components and interfaces +of the VFIO AP mediated device driver:: + + +-------------+ + | | + | +---------+ | mdev_register_driver() +--------------+ + | | Mdev | +<-----------------------+ | + | | bus | | | vfio_mdev.ko | + | | driver | +----------------------->+ |<-> VFIO user + | +---------+ | probe()/remove() +--------------+ APIs + | | + | MDEV CORE | + | MODULE | + | mdev.ko | + | +---------+ | mdev_register_parent() +--------------+ + | |Physical | +<-----------------------+ | + | | device | | | vfio_ap.ko |<-> matrix + | |interface| +----------------------->+ | device + | +---------+ | callback +--------------+ + +-------------+ + +During initialization of the vfio_ap module, the matrix device is registered +with an 'mdev_parent_ops' structure that provides the sysfs attribute +structures, mdev functions and callback interfaces for managing the mediated +matrix device. + +* sysfs attribute structures: + + supported_type_groups + The VFIO mediated device framework supports creation of user-defined + mediated device types. These mediated device types are specified + via the 'supported_type_groups' structure when a device is registered + with the mediated device framework. The registration process creates the + sysfs structures for each mediated device type specified in the + 'mdev_supported_types' sub-directory of the device being registered. Along + with the device type, the sysfs attributes of the mediated device type are + provided. + + The VFIO AP device driver will register one mediated device type for + passthrough devices: + + /sys/devices/vfio_ap/matrix/mdev_supported_types/vfio_ap-passthrough + + Only the read-only attributes required by the VFIO mdev framework will + be provided:: + + ... 
name + ... device_api + ... available_instances + ... device_api + + Where: + + * name: + specifies the name of the mediated device type + * device_api: + the mediated device type's API + * available_instances: + the number of vfio_ap mediated passthrough devices + that can be created + * device_api: + specifies the VFIO API + mdev_attr_groups + This attribute group identifies the user-defined sysfs attributes of the + mediated device. When a device is registered with the VFIO mediated device + framework, the sysfs attribute files identified in the 'mdev_attr_groups' + structure will be created in the vfio_ap mediated device's directory. The + sysfs attributes for a vfio_ap mediated device are: + + assign_adapter / unassign_adapter: + Write-only attributes for assigning/unassigning an AP adapter to/from the + vfio_ap mediated device. To assign/unassign an adapter, the APID of the + adapter is echoed into the respective attribute file. + assign_domain / unassign_domain: + Write-only attributes for assigning/unassigning an AP usage domain to/from + the vfio_ap mediated device. To assign/unassign a domain, the domain + number of the usage domain is echoed into the respective attribute + file. + matrix: + A read-only file for displaying the APQNs derived from the Cartesian + product of the adapter and domain numbers assigned to the vfio_ap mediated + device. + guest_matrix: + A read-only file for displaying the APQNs derived from the Cartesian + product of the adapter and domain numbers assigned to the APM and AQM + fields respectively of the KVM guest's CRYCB. This may differ from the + the APQNs assigned to the vfio_ap mediated device if any APQN does not + reference a queue device bound to the vfio_ap device driver (i.e., the + queue is not in the host's AP configuration). + assign_control_domain / unassign_control_domain: + Write-only attributes for assigning/unassigning an AP control domain + to/from the vfio_ap mediated device. 
To assign/unassign a control domain, + the ID of the domain to be assigned/unassigned is echoed into the + respective attribute file. + control_domains: + A read-only file for displaying the control domain numbers assigned to the + vfio_ap mediated device. + +* functions: + + create: + allocates the ap_matrix_mdev structure used by the vfio_ap driver to: + + * Store the reference to the KVM structure for the guest using the mdev + * Store the AP matrix configuration for the adapters, domains, and control + domains assigned via the corresponding sysfs attributes files + * Store the AP matrix configuration for the adapters, domains and control + domains available to a guest. A guest may not be provided access to APQNs + referencing queue devices that do not exist, or are not bound to the + vfio_ap device driver. + + remove: + deallocates the vfio_ap mediated device's ap_matrix_mdev structure. + This will be allowed only if a running guest is not using the mdev. + +* callback interfaces + + open_device: + The vfio_ap driver uses this callback to register a + VFIO_GROUP_NOTIFY_SET_KVM notifier callback function for the matrix mdev + devices. The open_device callback is invoked by userspace to connect the + VFIO iommu group for the matrix mdev device to the MDEV bus. Access to the + KVM structure used to configure the KVM guest is provided via this callback. + The KVM structure, is used to configure the guest's access to the AP matrix + defined via the vfio_ap mediated device's sysfs attribute files. + + close_device: + unregisters the VFIO_GROUP_NOTIFY_SET_KVM notifier callback function for the + matrix mdev device and deconfigures the guest's AP matrix. + + ioctl: + this callback handles the VFIO_DEVICE_GET_INFO and VFIO_DEVICE_RESET ioctls + defined by the vfio framework. 
+ +Configure the guest's AP resources +---------------------------------- +Configuring the AP resources for a KVM guest will be performed when the +VFIO_GROUP_NOTIFY_SET_KVM notifier callback is invoked. The notifier +function is called when userspace connects to KVM. The guest's AP resources are +configured via it's APCB by: + +* Setting the bits in the APM corresponding to the APIDs assigned to the + vfio_ap mediated device via its 'assign_adapter' interface. +* Setting the bits in the AQM corresponding to the domains assigned to the + vfio_ap mediated device via its 'assign_domain' interface. +* Setting the bits in the ADM corresponding to the domain dIDs assigned to the + vfio_ap mediated device via its 'assign_control_domains' interface. + +The linux device model precludes passing a device through to a KVM guest that +is not bound to the device driver facilitating its pass-through. Consequently, +an APQN that does not reference a queue device bound to the vfio_ap device +driver will not be assigned to a KVM guest's matrix. The AP architecture, +however, does not provide a means to filter individual APQNs from the guest's +matrix, so the adapters, domains and control domains assigned to vfio_ap +mediated device via its sysfs 'assign_adapter', 'assign_domain' and +'assign_control_domain' interfaces will be filtered before providing the AP +configuration to a guest: + +* The APIDs of the adapters, the APQIs of the domains and the domain numbers of + the control domains assigned to the matrix mdev that are not also assigned to + the host's AP configuration will be filtered. + +* Each APQN derived from the Cartesian product of the APIDs and APQIs assigned + to the vfio_ap mdev is examined and if any one of them does not reference a + queue device bound to the vfio_ap device driver, the adapter will not be + plugged into the guest (i.e., the bit corresponding to its APID will not be + set in the APM of the guest's APCB). 
+ +The CPU model features for AP +----------------------------- +The AP stack relies on the presence of the AP instructions as well as three +facilities: The AP Facilities Test (APFT) facility; the AP Query +Configuration Information (QCI) facility; and the AP Queue Interruption Control +facility. These features/facilities are made available to a KVM guest via the +following CPU model features: + +1. ap: Indicates whether the AP instructions are installed on the guest. This + feature will be enabled by KVM only if the AP instructions are installed + on the host. + +2. apft: Indicates the APFT facility is available on the guest. This facility + can be made available to the guest only if it is available on the host (i.e., + facility bit 15 is set). + +3. apqci: Indicates the AP QCI facility is available on the guest. This facility + can be made available to the guest only if it is available on the host (i.e., + facility bit 12 is set). + +4. apqi: Indicates AP Queue Interruption Control faclity is available on the + guest. This facility can be made available to the guest only if it is + available on the host (i.e., facility bit 65 is set). + +Note: If the user chooses to specify a CPU model different than the 'host' +model to QEMU, the CPU model features and facilities need to be turned on +explicitly; for example:: + + /usr/bin/qemu-system-s390x ... -cpu z13,ap=on,apqci=on,apft=on,apqi=on + +A guest can be precluded from using AP features/facilities by turning them off +explicitly; for example:: + + /usr/bin/qemu-system-s390x ... -cpu host,ap=off,apqci=off,apft=off,apqi=off + +Note: If the APFT facility is turned off (apft=off) for the guest, the guest +will not see any AP devices. The zcrypt device drivers on the guest that +register for type 10 and newer AP devices - i.e., the cex4card and cex4queue +device drivers - need the APFT facility to ascertain the facilities installed on +a given AP device. 
If the APFT facility is not installed on the guest, then no +adapter or domain devices will get created by the AP bus running on the +guest because only type 10 and newer devices can be configured for guest use. + +Example +======= +Let's now provide an example to illustrate how KVM guests may be given +access to AP facilities. For this example, we will show how to configure +three guests such that executing the lszcrypt command on the guests would +look like this: + +Guest1 +------ +=========== ===== ============ +CARD.DOMAIN TYPE MODE +=========== ===== ============ +05 CEX5C CCA-Coproc +05.0004 CEX5C CCA-Coproc +05.00ab CEX5C CCA-Coproc +06 CEX5A Accelerator +06.0004 CEX5A Accelerator +06.00ab CEX5A Accelerator +=========== ===== ============ + +Guest2 +------ +=========== ===== ============ +CARD.DOMAIN TYPE MODE +=========== ===== ============ +05 CEX5C CCA-Coproc +05.0047 CEX5C CCA-Coproc +05.00ff CEX5C CCA-Coproc +=========== ===== ============ + +Guest3 +------ +=========== ===== ============ +CARD.DOMAIN TYPE MODE +=========== ===== ============ +06 CEX5A Accelerator +06.0047 CEX5A Accelerator +06.00ff CEX5A Accelerator +=========== ===== ============ + +These are the steps: + +1. Install the vfio_ap module on the linux host. The dependency chain for the + vfio_ap module is: + * iommu + * s390 + * zcrypt + * vfio + * vfio_mdev + * vfio_mdev_device + * KVM + + To build the vfio_ap module, the kernel build must be configured with the + following Kconfig elements selected: + * IOMMU_SUPPORT + * S390 + * ZCRYPT + * VFIO + * KVM + + If using make menuconfig select the following to build the vfio_ap module:: + + -> Device Drivers + -> IOMMU Hardware Support + select S390 AP IOMMU Support + -> VFIO Non-Privileged userspace driver framework + -> Mediated device driver frramework + -> VFIO driver for Mediated devices + -> I/O subsystem + -> VFIO support for AP devices + +2. Secure the AP queues to be used by the three guests so that the host can not + access them. 
To secure them, there are two sysfs files that specify + bitmasks marking a subset of the APQN range as usable only by the default AP + queue device drivers. All remaining APQNs are available for use by + any other device driver. The vfio_ap device driver is currently the only + non-default device driver. The locations of the sysfs files containing the + masks are:: + + /sys/bus/ap/apmask + /sys/bus/ap/aqmask + + The 'apmask' is a 256-bit mask that identifies a set of AP adapter IDs + (APID). Each bit in the mask, from left to right, corresponds to an APID from + 0-255. If a bit is set, the APID belongs to the subset of APQNs marked as + available only to the default AP queue device drivers. + + The 'aqmask' is a 256-bit mask that identifies a set of AP queue indexes + (APQI). Each bit in the mask, from left to right, corresponds to an APQI from + 0-255. If a bit is set, the APQI belongs to the subset of APQNs marked as + available only to the default AP queue device drivers. + + The Cartesian product of the APIDs corresponding to the bits set in the + apmask and the APQIs corresponding to the bits set in the aqmask comprise + the subset of APQNs that can be used only by the host default device drivers. + All other APQNs are available to the non-default device drivers such as the + vfio_ap driver. + + Take, for example, the following masks:: + + apmask: + 0x7d00000000000000000000000000000000000000000000000000000000000000 + + aqmask: + 0x8000000000000000000000000000000000000000000000000000000000000000 + + The masks indicate: + + * Adapters 1, 2, 3, 4, 5, and 7 are available for use by the host default + device drivers. + + * Domain 0 is available for use by the host default device drivers + + * The subset of APQNs available for use only by the default host device + drivers are: + + (1,0), (2,0), (3,0), (4,0), (5,0) and (7,0) + + * All other APQNs are available for use by the non-default device drivers.
+ + The APQN of each AP queue device assigned to the linux host is checked by the + AP bus against the set of APQNs derived from the Cartesian product of APIDs + and APQIs marked as available to the default AP queue device drivers. If a + match is detected, only the default AP queue device drivers will be probed; + otherwise, the vfio_ap device driver will be probed. + + By default, the two masks are set to reserve all APQNs for use by the default + AP queue device drivers. There are two ways the default masks can be changed: + + 1. The sysfs mask files can be edited by echoing a string into the + respective sysfs mask file in one of two formats: + + * An absolute hex string starting with 0x - like "0x12345678" - sets + the mask. If the given string is shorter than the mask, it is padded + with 0s on the right; for example, specifying a mask value of 0x41 is + the same as specifying:: + + 0x4100000000000000000000000000000000000000000000000000000000000000 + + Keep in mind that the mask reads from left to right, so the mask + above identifies device numbers 1 and 7 (01000001). + + If the string is longer than the mask, the operation is terminated with + an error (EINVAL). + + * Individual bits in the mask can be switched on and off by specifying + each bit number to be switched in a comma separated list. Each bit + number string must be prepended with a plus ('+') or minus ('-') to indicate + the corresponding bit is to be switched on ('+') or off ('-'). Some + valid values are: + + - "+0" switches bit 0 on + - "-13" switches bit 13 off + - "+0x41" switches bit 65 on + - "-0xff" switches bit 255 off + + The following example: + + +0,-6,+0x47,-0xf0 + + Switches bits 0 and 71 (0x47) on + + Switches bits 6 and 240 (0xf0) off + + Note that the bits not specified in the list remain as they were before + the operation. + + 2.
The masks can also be changed at boot time via parameters on the kernel + command line like this: + + ap.apmask=0xffff ap.aqmask=0x40 + + This would create the following masks:: + + apmask: + 0xffff000000000000000000000000000000000000000000000000000000000000 + + aqmask: + 0x4000000000000000000000000000000000000000000000000000000000000000 + + Resulting in these two pools:: + + default drivers pool: adapter 0-15, domain 1 + alternate drivers pool: adapter 16-255, domains 0, 2-255 + + **Note:** + Changing a mask such that one or more APQNs will be taken from a vfio_ap + mediated device (see below) will fail with an error (EBUSY). A message + is logged to the kernel ring buffer which can be viewed with the 'dmesg' + command. The output identifies each APQN flagged as 'in use' and identifies + the vfio_ap mediated device to which it is assigned; for example: + + Userspace may not re-assign queue 05.0054 already assigned to 62177883-f1bb-47f0-914d-32a22e3a8804 + Userspace may not re-assign queue 04.0054 already assigned to cef03c3c-903d-4ecc-9a83-40694cb8aee4 + +Securing the APQNs for our example +---------------------------------- + To secure the AP queues 05.0004, 05.0047, 05.00ab, 05.00ff, 06.0004, 06.0047, + 06.00ab, and 06.00ff for use by the vfio_ap device driver, the corresponding + APQNs can be removed from the default masks using either of the following + commands:: + + echo -5,-6 > /sys/bus/ap/apmask + + echo -4,-0x47,-0xab,-0xff > /sys/bus/ap/aqmask + + Or the masks can be set as follows:: + + echo 0xf9ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff \ + > apmask + + echo 0xf7fffffffffffffffeffffffffffffffffffffffffeffffffffffffffffffffe \ + > aqmask + + This will result in AP queues 05.0004, 05.0047, 05.00ab, 05.00ff, 06.0004, + 06.0047, 06.00ab, and 06.00ff getting bound to the vfio_ap device driver. The + sysfs directory for the vfio_ap device driver will now contain symbolic links + to the AP queue devices bound to it:: + + /sys/bus/ap + ... 
[drivers] + ...... [vfio_ap] + ......... [05.0004] + ......... [05.0047] + ......... [05.00ab] + ......... [05.00ff] + ......... [06.0004] + ......... [06.0047] + ......... [06.00ab] + ......... [06.00ff] + + Keep in mind that only type 10 and newer adapters (i.e., CEX4 and later) + can be bound to the vfio_ap device driver. The reason for this is to + simplify the implementation by not needlessly complicating the design by + supporting older devices that will go out of service in the relatively near + future and for which there are few older systems on which to test. + + The administrator, therefore, must take care to secure only AP queues that + can be bound to the vfio_ap device driver. The device type for a given AP + queue device can be read from the parent card's sysfs directory. For example, + to see the hardware type of the queue 05.0004: + + cat /sys/bus/ap/devices/card05/hwtype + + The hwtype must be 10 or higher (CEX4 or newer) in order to be bound to the + vfio_ap device driver. + +3. Create the mediated devices needed to configure the AP matrixes for the + three guests and to provide an interface to the vfio_ap driver for + use by the guests:: + + /sys/devices/vfio_ap/matrix/ + --- [mdev_supported_types] + ------ [vfio_ap-passthrough] (passthrough vfio_ap mediated device type) + --------- create + --------- [devices] + + To create the mediated devices for the three guests:: + + uuidgen > create + uuidgen > create + uuidgen > create + + or + + echo $uuid1 > create + echo $uuid2 > create + echo $uuid3 > create + + This will create three mediated devices in the [devices] subdirectory named + after the UUID written to the create attribute file. 
We call them $uuid1, + $uuid2 and $uuid3 and this is the sysfs directory structure after creation:: + + /sys/devices/vfio_ap/matrix/ + --- [mdev_supported_types] + ------ [vfio_ap-passthrough] + --------- [devices] + ------------ [$uuid1] + --------------- assign_adapter + --------------- assign_control_domain + --------------- assign_domain + --------------- matrix + --------------- unassign_adapter + --------------- unassign_control_domain + --------------- unassign_domain + + ------------ [$uuid2] + --------------- assign_adapter + --------------- assign_control_domain + --------------- assign_domain + --------------- matrix + --------------- unassign_adapter + ----------------unassign_control_domain + ----------------unassign_domain + + ------------ [$uuid3] + --------------- assign_adapter + --------------- assign_control_domain + --------------- assign_domain + --------------- matrix + --------------- unassign_adapter + ----------------unassign_control_domain + ----------------unassign_domain + + Note *****: The vfio_ap mdevs do not persist across reboots unless the + mdevctl tool is used to create and persist them. + +4. The administrator now needs to configure the matrixes for the mediated + devices $uuid1 (for Guest1), $uuid2 (for Guest2) and $uuid3 (for Guest3). + + This is how the matrix is configured for Guest1:: + + echo 5 > assign_adapter + echo 6 > assign_adapter + echo 4 > assign_domain + echo 0xab > assign_domain + + Control domains can similarly be assigned using the assign_control_domain + sysfs file. + + If a mistake is made configuring an adapter, domain or control domain, + you can use the unassign_xxx files to unassign the adapter, domain or + control domain. 
+ + To display the matrix configuration for Guest1:: + + cat matrix + + To display the matrix that is or will be assigned to Guest1:: + + cat guest_matrix + + This is how the matrix is configured for Guest2:: + + echo 5 > assign_adapter + echo 0x47 > assign_domain + echo 0xff > assign_domain + + This is how the matrix is configured for Guest3:: + + echo 6 > assign_adapter + echo 0x47 > assign_domain + echo 0xff > assign_domain + + In order to successfully assign an adapter: + + * The adapter number specified must represent a value from 0 up to the + maximum adapter number configured for the system. If an adapter number + higher than the maximum is specified, the operation will terminate with + an error (ENODEV). + + Note: The maximum adapter number can be obtained via the sysfs + /sys/bus/ap/ap_max_adapter_id attribute file. + + * Each APQN derived from the Cartesian product of the APID of the adapter + being assigned and the APQIs of the domains previously assigned: + + - Must only be available to the vfio_ap device driver as specified in the + sysfs /sys/bus/ap/apmask and /sys/bus/ap/aqmask attribute files. If even + one APQN is reserved for use by the host device driver, the operation + will terminate with an error (EADDRNOTAVAIL). + + - Must NOT be assigned to another vfio_ap mediated device. If even one APQN + is assigned to another vfio_ap mediated device, the operation will + terminate with an error (EBUSY). + + - Must NOT be assigned while the sysfs /sys/bus/ap/apmask and + /sys/bus/ap/aqmask attribute files are being edited or the operation may + terminate with an error (EBUSY). + + In order to successfully assign a domain: + + * The domain number specified must represent a value from 0 up to the + maximum domain number configured for the system. If a domain number + higher than the maximum is specified, the operation will terminate with + an error (ENODEV).
+ + Note: The maximum domain number can be obtained via the sysfs + /sys/bus/ap/ap_max_domain_id attribute file. + + * Each APQN derived from the Cartesian product of the APQI of the domain + being assigned and the APIDs of the adapters previously assigned: + + - Must only be available to the vfio_ap device driver as specified in the + sysfs /sys/bus/ap/apmask and /sys/bus/ap/aqmask attribute files. If even + one APQN is reserved for use by the host device driver, the operation + will terminate with an error (EADDRNOTAVAIL). + + - Must NOT be assigned to another vfio_ap mediated device. If even one APQN + is assigned to another vfio_ap mediated device, the operation will + terminate with an error (EBUSY). + + - Must NOT be assigned while the sysfs /sys/bus/ap/apmask and + /sys/bus/ap/aqmask attribute files are being edited or the operation may + terminate with an error (EBUSY). + + In order to successfully assign a control domain: + + * The domain number specified must represent a value from 0 up to the maximum + domain number configured for the system. If a control domain number higher + than the maximum is specified, the operation will terminate with an + error (ENODEV). + +5. Start Guest1:: + + /usr/bin/qemu-system-s390x ... -cpu host,ap=on,apqci=on,apft=on,apqi=on \ + -device vfio-ap,sysfsdev=/sys/devices/vfio_ap/matrix/$uuid1 ... + +6. Start Guest2:: + + /usr/bin/qemu-system-s390x ... -cpu host,ap=on,apqci=on,apft=on,apqi=on \ + -device vfio-ap,sysfsdev=/sys/devices/vfio_ap/matrix/$uuid2 ... + +7. Start Guest3:: + + /usr/bin/qemu-system-s390x ... -cpu host,ap=on,apqci=on,apft=on,apqi=on \ + -device vfio-ap,sysfsdev=/sys/devices/vfio_ap/matrix/$uuid3 ... + +When the guest is shut down, the vfio_ap mediated devices may be removed.
+ +Using our example again, to remove the vfio_ap mediated device $uuid1:: + + /sys/devices/vfio_ap/matrix/ + --- [mdev_supported_types] + ------ [vfio_ap-passthrough] + --------- [devices] + ------------ [$uuid1] + --------------- remove + +:: + + echo 1 > remove + +This will remove all of the matrix mdev device's sysfs structures including +the mdev device itself. To recreate and reconfigure the matrix mdev device, +all of the steps starting with step 3 will have to be performed again. Note +that the remove will fail if a guest using the vfio_ap mdev is still running. + +It is not necessary to remove a vfio_ap mdev, but one may want to +remove it if no guest will use it during the remaining lifetime of the linux +host. If the vfio_ap mdev is removed, one may want to also reconfigure +the pool of adapters and queues reserved for use by the default drivers. + +Hot plug/unplug support: +======================== +An adapter, domain or control domain may be hot plugged into a running KVM +guest by assigning it to the vfio_ap mediated device being used by the guest if +the following conditions are met: + +* The adapter, domain or control domain must also be assigned to the host's + AP configuration. + +* Each APQN derived from the Cartesian product comprised of the APID of the + adapter being assigned and the APQIs of the domains assigned must reference a + queue device bound to the vfio_ap device driver. + +* To hot plug a domain, each APQN derived from the Cartesian product + comprised of the APQI of the domain being assigned and the APIDs of the + adapters assigned must reference a queue device bound to the vfio_ap device + driver. + +An adapter, domain or control domain may be hot unplugged from a running KVM +guest by unassigning it from the vfio_ap mediated device being used by the +guest. 
+ +Over-provisioning of AP queues for a KVM guest: +=============================================== +Over-provisioning is defined herein as the assignment of adapters or domains to +a vfio_ap mediated device that do not reference AP devices in the host's AP +configuration. The idea here is that when the adapter or domain becomes +available, it will be automatically hot-plugged into the KVM guest using +the vfio_ap mediated device to which it is assigned as long as each new APQN +resulting from plugging it in references a queue device bound to the vfio_ap +device driver. + +Limitations +=========== +Live guest migration is not supported for guests using AP devices without +intervention by a system administrator. Before a KVM guest can be migrated, +the vfio_ap mediated device must be removed. Unfortunately, it can not be +removed manually (i.e., echo 1 > /sys/devices/vfio_ap/matrix/$UUID/remove) while +the mdev is in use by a KVM guest. If the guest is being emulated by QEMU, +its mdev can be hot unplugged from the guest in one of two ways: + +1. If the KVM guest was started with libvirt, you can hot unplug the mdev via + the following commands: + + virsh detach-device + + For example, to hot unplug mdev 62177883-f1bb-47f0-914d-32a22e3a8804 from + the guest named 'my-guest': + + virsh detach-device my-guest ~/config/my-guest-hostdev.xml + + The contents of my-guest-hostdev.xml: + +.. code-block:: xml + + + +
+ + + + + virsh qemu-monitor-command --hmp "device-del " + + For example, to hot unplug the vfio_ap mediated device identified on the + qemu command line with 'id=hostdev0' from the guest named 'my-guest': + +.. code-block:: sh + + virsh qemu-monitor-command my-guest --hmp "device_del hostdev0" + +2. A vfio_ap mediated device can be hot unplugged by attaching the qemu monitor + to the guest and using the following qemu monitor command: + + (QEMU) device-del id= + + For example, to hot unplug the vfio_ap mediated device that was specified + on the qemu command line with 'id=hostdev0' when the guest was started: + + (QEMU) device-del id=hostdev0 + +After live migration of the KVM guest completes, an AP configuration can be +restored to the KVM guest by hot plugging a vfio_ap mediated device on the target +system into the guest in one of two ways: + +1. If the KVM guest was started with libvirt, you can hot plug a matrix mediated + device into the guest via the following virsh commands: + + virsh attach-device + + For example, to hot plug mdev 62177883-f1bb-47f0-914d-32a22e3a8804 into + the guest named 'my-guest': + + virsh attach-device my-guest ~/config/my-guest-hostdev.xml + + The contents of my-guest-hostdev.xml: + +.. code-block:: xml + + + +
+ + + + + virsh qemu-monitor-command --hmp \ + "device_add vfio-ap,sysfsdev=,id=" + + For example, to hot plug the vfio_ap mediated device + 62177883-f1bb-47f0-914d-32a22e3a8804 into the guest named 'my-guest' with + device-id hostdev0: + + virsh qemu-monitor-command my-guest --hmp \ + "device_add vfio-ap,\ + sysfsdev=/sys/devices/vfio_ap/matrix/62177883-f1bb-47f0-914d-32a22e3a8804,\ + id=hostdev0" + +2. A vfio_ap mediated device can be hot plugged by attaching the qemu monitor + to the guest and using the following qemu monitor command: + + (qemu) device_add "vfio-ap,sysfsdev=,id=" + + For example, to plug the vfio_ap mediated device + 62177883-f1bb-47f0-914d-32a22e3a8804 into the guest with the device-id + hostdev0: + + (QEMU) device-add "vfio-ap,\ + sysfsdev=/sys/devices/vfio_ap/matrix/62177883-f1bb-47f0-914d-32a22e3a8804,\ + id=hostdev0" diff --git a/Documentation/arch/s390/vfio-ccw.rst b/Documentation/arch/s390/vfio-ccw.rst new file mode 100644 index 000000000000..42960b7b0d70 --- /dev/null +++ b/Documentation/arch/s390/vfio-ccw.rst @@ -0,0 +1,445 @@ +================================== +vfio-ccw: the basic infrastructure +================================== + +Introduction +------------ + +Here we describe the vfio support for I/O subchannel devices for +Linux/s390. Motivation for vfio-ccw is to passthrough subchannels to a +virtual machine, while vfio is the means. + +Different than other hardware architectures, s390 has defined a unified +I/O access method, which is so called Channel I/O. It has its own access +patterns: + +- Channel programs run asynchronously on a separate (co)processor. +- The channel subsystem will access any memory designated by the caller + in the channel program directly, i.e. there is no iommu involved. + +Thus when we introduce vfio support for these devices, we realize it +with a mediated device (mdev) implementation. The vfio mdev will be +added to an iommu group, so as to make itself able to be managed by the +vfio framework. 
And we add read/write callbacks for special vfio I/O +regions to pass the channel programs from the mdev to its parent device +(the real I/O subchannel device) to do further address translation and +to perform I/O instructions. + +This document does not intend to explain the s390 I/O architecture in +every detail. More information/reference could be found here: + +- A good start to know Channel I/O in general: + https://en.wikipedia.org/wiki/Channel_I/O +- s390 architecture: + s390 Principles of Operation manual (IBM Form. No. SA22-7832) +- The existing QEMU code which implements a simple emulated channel + subsystem could also be a good reference. It makes it easier to follow + the flow. + qemu/hw/s390x/css.c + +For vfio mediated device framework: +- Documentation/driver-api/vfio-mediated-device.rst + +Motivation of vfio-ccw +---------------------- + +Typically, a guest virtualized via QEMU/KVM on s390 only sees +paravirtualized virtio devices via the "Virtio Over Channel I/O +(virtio-ccw)" transport. This makes virtio devices discoverable via +standard operating system algorithms for handling channel devices. + +However this is not enough. On s390 for the majority of devices, which +use the standard Channel I/O based mechanism, we also need to provide +the functionality of passing through them to a QEMU virtual machine. +This includes devices that don't have a virtio counterpart (e.g. tape +drives) or that have specific characteristics which guests want to +exploit. + +For passing a device to a guest, we want to use the same interface as +everybody else, namely vfio. We implement this vfio support for channel +devices via the vfio mediated device framework and the subchannel device +driver "vfio_ccw". + +Access patterns of CCW devices +------------------------------ + +s390 architecture has implemented a so called channel subsystem, that +provides a unified view of the devices physically attached to the +systems. 
Though the s390 hardware platform knows about a huge variety of +different peripheral attachments like disk devices (aka. DASDs), tapes, +communication controllers, etc., they can all be accessed by a well +defined access method and they are presenting I/O completion in a unified +way: I/O interruptions. + +All I/O requires the use of channel command words (CCWs). A CCW is an +instruction to a specialized I/O channel processor. A channel program is +a sequence of CCWs which are executed by the I/O channel subsystem. To +issue a channel program to the channel subsystem, it is required to +build an operation request block (ORB), which can be used to point out +the format of the CCW and other control information to the system. The +operating system signals the I/O channel subsystem to begin executing +the channel program with a SSCH (start sub-channel) instruction. The +central processor is then free to proceed with non-I/O instructions +until interrupted. The I/O completion result is received by the +interrupt handler in the form of interrupt response block (IRB). + +Back to vfio-ccw, in short: + +- ORBs and channel programs are built in guest kernel (with guest + physical addresses). +- ORBs and channel programs are passed to the host kernel. +- Host kernel translates the guest physical addresses to real addresses + and starts the I/O with issuing a privileged Channel I/O instruction + (e.g SSCH). +- channel programs run asynchronously on a separate processor. +- I/O completion will be signaled to the host with I/O interruptions. + And it will be copied as IRB to user space to pass it back to the + guest. + +Physical vfio ccw device and its child mdev +------------------------------------------- + +As mentioned above, we realize vfio-ccw with a mdev implementation. + +Channel I/O does not have IOMMU hardware support, so the physical +vfio-ccw device does not have an IOMMU level translation or isolation. + +Subchannel I/O instructions are all privileged instructions.
When +handling the I/O instruction interception, vfio-ccw has the software +policing and translation how the channel program is programmed before +it gets sent to hardware. + +Within this implementation, we have two drivers for two types of +devices: + +- The vfio_ccw driver for the physical subchannel device. + This is an I/O subchannel driver for the real subchannel device. It + realizes a group of callbacks and registers to the mdev framework as a + parent (physical) device. As a consequence, mdev provides vfio_ccw a + generic interface (sysfs) to create mdev devices. A vfio mdev could be + created by vfio_ccw then and added to the mediated bus. It is the vfio + device that is added to an IOMMU group and a vfio group. + vfio_ccw also provides an I/O region to accept channel program + request from user space and store I/O interrupt result for user + space to retrieve. To notify user space an I/O completion, it offers + an interface to setup an eventfd fd for asynchronous signaling. + +- The vfio_mdev driver for the mediated vfio ccw device. + This is provided by the mdev framework. It is a vfio device driver for + the mdev that is created by vfio_ccw. + It realizes a group of vfio device driver callbacks, adds itself to a + vfio group, and registers itself to the mdev framework as a mdev + driver. + It uses a vfio iommu backend that uses the existing map and unmap + ioctls, but rather than programming them into an IOMMU for a device, + it simply stores the translations for use by later requests. This + means that a device programmed in a VM with guest physical addresses + can have the vfio kernel convert that address to process virtual + address, pin the page and program the hardware with the host physical + address in one step. + For a mdev, the vfio iommu backend will not pin the pages during the + VFIO_IOMMU_MAP_DMA ioctl. Mdev framework will only maintain a database + of the iova<->vaddr mappings in this operation.
And they export a + vfio_pin_pages and a vfio_unpin_pages interfaces from the vfio iommu + backend for the physical devices to pin and unpin pages by demand. + +Below is a high Level block diagram:: + + +-------------+ + | | + | +---------+ | mdev_register_driver() +--------------+ + | | Mdev | +<-----------------------+ | + | | bus | | | vfio_mdev.ko | + | | driver | +----------------------->+ |<-> VFIO user + | +---------+ | probe()/remove() +--------------+ APIs + | | + | MDEV CORE | + | MODULE | + | mdev.ko | + | +---------+ | mdev_register_parent() +--------------+ + | |Physical | +<-----------------------+ | + | | device | | | vfio_ccw.ko |<-> subchannel + | |interface| +----------------------->+ | device + | +---------+ | callback +--------------+ + +-------------+ + +The process of how these work together. + +1. vfio_ccw.ko drives the physical I/O subchannel, and registers the + physical device (with callbacks) to mdev framework. + When vfio_ccw probing the subchannel device, it registers device + pointer and callbacks to the mdev framework. Mdev related file nodes + under the device node in sysfs would be created for the subchannel + device, namely 'mdev_create', 'mdev_destroy' and + 'mdev_supported_types'. +2. Create a mediated vfio ccw device. + Use the 'mdev_create' sysfs file, we need to manually create one (and + only one for our case) mediated device. +3. vfio_mdev.ko drives the mediated ccw device. + vfio_mdev is also the vfio device driver. It will probe the mdev and + add it to an iommu_group and a vfio_group. Then we could pass through + the mdev to a guest. + + +VFIO-CCW Regions +---------------- + +The vfio-ccw driver exposes MMIO regions to accept requests from and return +results to userspace. + +vfio-ccw I/O region +------------------- + +An I/O region is used to accept channel program request from user +space and store I/O interrupt result for user space to retrieve. 
The +definition of the region is:: + + struct ccw_io_region { + #define ORB_AREA_SIZE 12 + __u8 orb_area[ORB_AREA_SIZE]; + #define SCSW_AREA_SIZE 12 + __u8 scsw_area[SCSW_AREA_SIZE]; + #define IRB_AREA_SIZE 96 + __u8 irb_area[IRB_AREA_SIZE]; + __u32 ret_code; + } __packed; + +This region is always available. + +While starting an I/O request, orb_area should be filled with the +guest ORB, and scsw_area should be filled with the SCSW of the Virtual +Subchannel. + +irb_area stores the I/O result. + +ret_code stores a return code for each access of the region. The following +values may occur: + +``0`` + The operation was successful. + +``-EOPNOTSUPP`` + The ORB specified transport mode or the + SCSW specified a function other than the start function. + +``-EIO`` + A request was issued while the device was not in a state ready to accept + requests, or an internal error occurred. + +``-EBUSY`` + The subchannel was status pending or busy, or a request is already active. + +``-EAGAIN`` + A request was being processed, and the caller should retry. + +``-EACCES`` + The channel path(s) used for the I/O were found to be not operational. + +``-ENODEV`` + The device was found to be not operational. + +``-EINVAL`` + The orb specified a chain longer than 255 ccws, or an internal error + occurred. + + +vfio-ccw cmd region +------------------- + +The vfio-ccw cmd region is used to accept asynchronous instructions +from userspace:: + + #define VFIO_CCW_ASYNC_CMD_HSCH (1 << 0) + #define VFIO_CCW_ASYNC_CMD_CSCH (1 << 1) + struct ccw_cmd_region { + __u32 command; + __u32 ret_code; + } __packed; + +This region is exposed via region type VFIO_REGION_SUBTYPE_CCW_ASYNC_CMD. + +Currently, CLEAR SUBCHANNEL and HALT SUBCHANNEL use this region. + +command specifies the command to be issued; ret_code stores a return code +for each access of the region. The following values may occur: + +``0`` + The operation was successful. + +``-ENODEV`` + The device was found to be not operational. 
+ +``-EINVAL`` + A command other than halt or clear was specified. + +``-EIO`` + A request was issued while the device was not in a state ready to accept + requests. + +``-EAGAIN`` + A request was being processed, and the caller should retry. + +``-EBUSY`` + The subchannel was status pending or busy while processing a halt request. + +vfio-ccw schib region +--------------------- + +The vfio-ccw schib region is used to return Subchannel-Information +Block (SCHIB) data to userspace:: + + struct ccw_schib_region { + #define SCHIB_AREA_SIZE 52 + __u8 schib_area[SCHIB_AREA_SIZE]; + } __packed; + +This region is exposed via region type VFIO_REGION_SUBTYPE_CCW_SCHIB. + +Reading this region triggers a STORE SUBCHANNEL to be issued to the +associated hardware. + +vfio-ccw crw region +--------------------- + +The vfio-ccw crw region is used to return Channel Report Word (CRW) +data to userspace:: + + struct ccw_crw_region { + __u32 crw; + __u32 pad; + } __packed; + +This region is exposed via region type VFIO_REGION_SUBTYPE_CCW_CRW. + +Reading this region returns a CRW if one that is relevant for this +subchannel (e.g. one reporting changes in channel path state) is +pending, or all zeroes if not. If multiple CRWs are pending (including +possibly chained CRWs), reading this region again will return the next +one, until no more CRWs are pending and zeroes are returned. This is +similar to how STORE CHANNEL REPORT WORD works. + +vfio-ccw operation details +-------------------------- + +vfio-ccw follows what vfio-pci did on the s390 platform and uses +vfio-iommu-type1 as the vfio iommu backend. + +* CCW translation APIs + A group of APIs (start with `cp_`) to do CCW translation. The CCWs + passed in by a user space program are organized with their guest + physical memory addresses. These APIs will copy the CCWs into kernel + space, and assemble a runnable kernel channel program by updating the + guest physical addresses with their corresponding host physical addresses. 
+ Note that we have to use IDALs even for direct-access CCWs, as the + referenced memory can be located anywhere, including above 2G. + +* vfio_ccw device driver + This driver utilizes the CCW translation APIs and introduces + vfio_ccw, which is the driver for the I/O subchannel devices you want + to pass through. + vfio_ccw implements the following vfio ioctls:: + + VFIO_DEVICE_GET_INFO + VFIO_DEVICE_GET_IRQ_INFO + VFIO_DEVICE_GET_REGION_INFO + VFIO_DEVICE_RESET + VFIO_DEVICE_SET_IRQS + + This provides an I/O region, so that the user space program can pass a + channel program to the kernel, to do further CCW translation before + issuing them to a real device. + This also provides the SET_IRQ ioctl to setup an event notifier to + notify the user space program the I/O completion in an asynchronous + way. + +The use of vfio-ccw is not limited to QEMU, while QEMU is definitely a +good example to help understand how these patches work. Here is a little +bit more detail how an I/O request triggered by the QEMU guest will be +handled (without error handling). + +Explanation: + +- Q1-Q7: QEMU side process. +- K1-K6: Kernel side process. + +Q1. + Get I/O region info during initialization. + +Q2. + Setup event notifier and handler to handle I/O completion. + +... ... + +Q3. + Intercept a ssch instruction. +Q4. + Write the guest channel program and ORB to the I/O region. + + K1. + Copy from guest to kernel. + K2. + Translate the guest channel program to a host kernel space + channel program, which becomes runnable for a real device. + K3. + With the necessary information contained in the orb passed in + by QEMU, issue the ccwchain to the device. + K4. + Return the ssch CC code. +Q5. + Return the CC code to the guest. + +... ... + + K5. + Interrupt handler gets the I/O result and write the result to + the I/O region. + K6. + Signal QEMU to retrieve the result. + +Q6. + Get the signal and event handler reads out the result from the I/O + region. +Q7.
+ Update the irb for the guest. + +Limitations +----------- + +The current vfio-ccw implementation focuses on supporting basic commands +needed to implement block device functionality (read/write) of DASD/ECKD +device only. Some commands may need special handling in the future, for +example, anything related to path grouping. + +DASD is a kind of storage device, while ECKD is a data recording format. +More information on DASD and ECKD can be found here: +https://en.wikipedia.org/wiki/Direct-access_storage_device +https://en.wikipedia.org/wiki/Count_key_data + +Together with the corresponding work in QEMU, we can bring the passed +through DASD/ECKD device online in a guest now and use it as a block +device. + +The current code allows the guest to start channel programs via +START SUBCHANNEL, and to issue HALT SUBCHANNEL, CLEAR SUBCHANNEL, +and STORE SUBCHANNEL. + +Currently all channel programs are prefetched, regardless of the +p-bit setting in the ORB. As a result, self-modifying channel +programs are not supported. For this reason, IPL has to be handled as +a special case by a userspace/guest program; this has been implemented +in QEMU's s390-ccw bios as of QEMU 4.1. + +vfio-ccw supports classic (command mode) channel I/O only. Transport +mode (HPF) is not supported. + +QDIO subchannels are currently not supported. Classic devices other than +DASD/ECKD might work, but have not been tested. + +Reference +--------- +1. ESA/s390 Principles of Operation manual (IBM Form. No. SA22-7832) +2. ESA/390 Common I/O Device Commands manual (IBM Form. No. SA22-7204) +3. https://en.wikipedia.org/wiki/Channel_I/O +4. Documentation/arch/s390/cds.rst +5. Documentation/driver-api/vfio.rst +6.
Documentation/driver-api/vfio-mediated-device.rst diff --git a/Documentation/arch/s390/zfcpdump.rst b/Documentation/arch/s390/zfcpdump.rst new file mode 100644 index 000000000000..a61de7aa8778 --- /dev/null +++ b/Documentation/arch/s390/zfcpdump.rst @@ -0,0 +1,50 @@ +================================== +The s390 SCSI dump tool (zfcpdump) +================================== + +System z machines (z900 or higher) provide hardware support for creating system +dumps on SCSI disks. The dump process is initiated by booting a dump tool, which +has to create a dump of the current (probably crashed) Linux image. In order to +not overwrite memory of the crashed Linux with data of the dump tool, the +hardware saves some memory plus the register sets of the boot CPU before the +dump tool is loaded. There exists an SCLP hardware interface to obtain the saved +memory afterwards. Currently 32 MB are saved. + +This zfcpdump implementation consists of a Linux dump kernel together with +a user space dump tool, which are loaded together into the saved memory region +below 32 MB. zfcpdump is installed on a SCSI disk using zipl (as contained in +the s390-tools package) to make the device bootable. The operator of a Linux +system can then trigger a SCSI dump by booting the SCSI disk on which zfcpdump +resides. + +The user space dump tool accesses the memory of the crashed system by means +of the /proc/vmcore interface. This interface exports the crashed system's +memory and registers in ELF core dump format. To access the memory which has +been saved by the hardware, SCLP requests will be created at the time the data +is needed by /proc/vmcore. The tail part of the crashed system's memory which +has not been stashed by hardware can just be copied from real memory. + +To build a dump enabled kernel the kernel config option CONFIG_CRASH_DUMP +has to be set. + +To get a valid zfcpdump kernel configuration use "make zfcpdump_defconfig".
+ +The s390 zipl tool looks for the zfcpdump kernel and optional initrd/initramfs +under the following locations: + +* kernel: <zfcpdump directory>/zfcpdump.image +* ramdisk: <zfcpdump directory>/zfcpdump.rd + +The zfcpdump directory is defined in the s390-tools package. + +The user space application of zfcpdump can reside in an initramfs or an +initrd. It can also be included in a built-in kernel initramfs. The application +reads from /proc/vmcore or zcore/mem and writes the system dump to a SCSI disk. + +The s390-tools package version 1.24.0 and above builds an external zfcpdump +initramfs with a user space application that writes the dump to a SCSI +partition. + +For more information on how to use zfcpdump, refer to the s390 'Using the Dump +Tools' book, which is available from IBM Knowledge Center: +https://www.ibm.com/support/knowledgecenter/linuxonibm/liaaf/lnz_r_dt.html diff --git a/Documentation/driver-api/s390-drivers.rst b/Documentation/driver-api/s390-drivers.rst index 5158577bc29b..8c0845c4eee7 100644 --- a/Documentation/driver-api/s390-drivers.rst +++ b/Documentation/driver-api/s390-drivers.rst @@ -27,7 +27,7 @@ not strictly considered I/O devices. They are considered here as well, although they are not the focus of this document. Some additional information can also be found in the kernel source under -Documentation/s390/driver-model.rst. +Documentation/arch/s390/driver-model.rst. The css bus =========== @@ -38,7 +38,7 @@ into several categories: * Standard I/O subchannels, for use by the system. They have a child device on the ccw bus and are described below. * I/O subchannels bound to the vfio-ccw driver. See - Documentation/s390/vfio-ccw.rst. + Documentation/arch/s390/vfio-ccw.rst. * Message subchannels. No Linux driver currently exists. * CHSC subchannels (at most one). The chsc subchannel driver can be used to send asynchronous chsc commands.
diff --git a/Documentation/s390/3270.ChangeLog b/Documentation/s390/3270.ChangeLog deleted file mode 100644 index ecaf60b6c381..000000000000 --- a/Documentation/s390/3270.ChangeLog +++ /dev/null @@ -1,44 +0,0 @@ -ChangeLog for the UTS Global 3270-support patch - -Sep 2002: Get bootup colors right on 3270 console - * In tubttybld.c, substantially revise ESC processing so that - ESC sequences (especially coloring ones) and the strings - they affect work as right as 3270 can get them. Also, set - screen height to omit the two rows used for input area, in - tty3270_open() in tubtty.c. - -Sep 2002: Dynamically get 3270 input buffer - * Oversize 3270 screen widths may exceed GEOM_MAXINPLEN columns, - so get input-area buffer dynamically when sizing the device in - tubmakemin() in tuball.c (if it's the console) or tty3270_open() - in tubtty.c (if needed). Change tubp->tty_input to be a - pointer rather than an array, in tubio.h. - -Sep 2002: Fix tubfs kmalloc()s - * Do read and write lengths correctly in fs3270_read() - and fs3270_write(), while never asking kmalloc() - for more than 0x800 bytes. Affects tubfs.c and tubio.h. - -Sep 2002: Recognize 3270 control unit type 3174 - * Recognize control-unit type 0x3174 as well as 0x327?. - The IBM 2047 device emulates a 3174 control unit. - Modularize control-unit recognition in tuball.c by - adding and invoking new tub3270_is_ours(). - -Apr 2002: Fix 3270 console reboot loop - * (Belated log entry) Fixed reboot loop if 3270 console, - in tubtty.c:ttu3270_bh(). - -Feb 6, 2001: - * This changelog is new - * tub3270 now supports 3270 console: - Specify y for CONFIG_3270 and y for CONFIG_3270_CONSOLE. - Support for 3215 will not appear if 3270 console support - is chosen. - NOTE: The default is 3270 console support, NOT 3215. - * the components are remodularized: added source modules are - tubttybld.c and tubttyscl.c, for screen-building code and - scroll-timeout code. 
- * tub3270 source for this (2.4.0) version is #ifdeffed to - build with both 2.4.0 and 2.2.16.2. - * color support and minimal other ESC-sequence support is added. diff --git a/Documentation/s390/3270.rst b/Documentation/s390/3270.rst deleted file mode 100644 index e09e77954238..000000000000 --- a/Documentation/s390/3270.rst +++ /dev/null @@ -1,298 +0,0 @@ -=============================== -IBM 3270 Display System support -=============================== - -This file describes the driver that supports local channel attachment -of IBM 3270 devices. It consists of three sections: - - * Introduction - * Installation - * Operation - - -Introduction -============ - -This paper describes installing and operating 3270 devices under -Linux/390. A 3270 device is a block-mode rows-and-columns terminal of -which I'm sure hundreds of millions were sold by IBM and clonemakers -twenty and thirty years ago. - -You may have 3270s in-house and not know it. If you're using the -VM-ESA operating system, define a 3270 to your virtual machine by using -the command "DEF GRAF " This paper presumes you will be -defining four 3270s with the CP/CMS commands: - - - DEF GRAF 620 - - DEF GRAF 621 - - DEF GRAF 622 - - DEF GRAF 623 - -Your network connection from VM-ESA allows you to use x3270, tn3270, or -another 3270 emulator, started from an xterm window on your PC or -workstation. With the DEF GRAF command, an application such as xterm, -and this Linux-390 3270 driver, you have another way of talking to your -Linux box. - -This paper covers installation of the driver and operation of a -dialed-in x3270. - - -Installation -============ - -You install the driver by installing a patch, doing a kernel build, and -running the configuration script (config3270.sh, in this directory). - -WARNING: If you are using 3270 console support, you must rerun the -configuration script every time you change the console's address (perhaps -by using the condev= parameter in silo's /boot/parmfile). 
More precisely, -you should rerun the configuration script every time your set of 3270s, -including the console 3270, changes subchannel identifier relative to -one another. ReIPL as soon as possible after running the configuration -script and the resulting /tmp/mkdev3270. - -If you have chosen to make tub3270 a module, you add a line to a -configuration file under /etc/modprobe.d/. If you are working on a VM -virtual machine, you can use DEF GRAF to define virtual 3270 devices. - -You may generate both 3270 and 3215 console support, or one or the -other, or neither. If you generate both, the console type under VM is -not changed. Use #CP Q TERM to see what the current console type is. -Use #CP TERM CONMODE 3270 to change it to 3270. If you generate only -3270 console support, then the driver automatically converts your console -at boot time to a 3270 if it is a 3215. - -In brief, these are the steps: - - 1. Install the tub3270 patch - 2. (If a module) add a line to a file in `/etc/modprobe.d/*.conf` - 3. (If VM) define devices with DEF GRAF - 4. Reboot - 5. Configure - -To test that everything works, assuming VM and x3270, - - 1. Bring up an x3270 window. - 2. Use the DIAL command in that window. - 3. You should immediately see a Linux login screen. - -Here are the installation steps in detail: - - 1. The 3270 driver is a part of the official Linux kernel - source. Build a tree with the kernel source and any necessary - patches. Then do:: - - make oldconfig - (If you wish to disable 3215 console support, edit - .config; change CONFIG_TN3215's value to "n"; - and rerun "make oldconfig".) - make image - make modules - make modules_install - - 2. (Perform this step only if you have configured tub3270 as a - module.) Add a line to a file `/etc/modprobe.d/*.conf` to automatically - load the driver when it's needed. 
With this line added, you will see - login prompts appear on your 3270s as soon as boot is complete (or - with emulated 3270s, as soon as you dial into your vm guest using the - command "DIAL "). Since the line-mode major number is - 227, the line to add should be:: - - alias char-major-227 tub3270 - - 3. Define graphic devices to your vm guest machine, if you - haven't already. Define them before you reboot (reipl): - - - DEFINE GRAF 620 - - DEFINE GRAF 621 - - DEFINE GRAF 622 - - DEFINE GRAF 623 - - 4. Reboot. The reboot process scans hardware devices, including - 3270s, and this enables the tub3270 driver once loaded to respond - correctly to the configuration requests of the next step. If - you have chosen 3270 console support, your console now behaves - as a 3270, not a 3215. - - 5. Run the 3270 configuration script config3270. It is - distributed in this same directory, Documentation/s390, as - config3270.sh. Inspect the output script it produces, - /tmp/mkdev3270, and then run that script. This will create the - necessary character special device files and make the necessary - changes to /etc/inittab. - - Then notify /sbin/init that /etc/inittab has changed, by issuing - the telinit command with the q operand:: - - cd Documentation/s390 - sh config3270.sh - sh /tmp/mkdev3270 - telinit q - - This should be sufficient for your first time. If your 3270 - configuration has changed and you're reusing config3270, you - should follow these steps:: - - Change 3270 configuration - Reboot - Run config3270 and /tmp/mkdev3270 - Reboot - -Here are the testing steps in detail: - - 1. Bring up an x3270 window, or use an actual hardware 3278 or - 3279, or use the 3270 emulator of your choice. You would be - running the emulator on your PC or workstation. You would use - the command, for example:: - - x3270 vm-esa-domain-name & - - if you wanted a 3278 Model 4 with 43 rows of 80 columns, the - default model number. The driver does not take advantage of - extended attributes. 
- - The screen you should now see contains a VM logo with input - lines near the bottom. Use TAB to move to the bottom line, - probably labeled "COMMAND ===>". - - 2. Use the DIAL command instead of the LOGIN command to connect - to one of the virtual 3270s you defined with the DEF GRAF - commands:: - - dial my-vm-guest-name - - 3. You should immediately see a login prompt from your - Linux-390 operating system. If that does not happen, you would - see instead the line "DIALED TO my-vm-guest-name 0620". - - To troubleshoot: do these things. - - A. Is the driver loaded? Use the lsmod command (no operands) - to find out. Probably it isn't. Try loading it manually, with - the command "insmod tub3270". Does that command give error - messages? Ha! There's your problem. - - B. Is the /etc/inittab file modified as in installation step 3 - above? Use the grep command to find out; for instance, issue - "grep 3270 /etc/inittab". Nothing found? There's your - problem! - - C. Are the device special files created, as in installation - step 2 above? Use the ls -l command to find out; for instance, - issue "ls -l /dev/3270/tty620". The output should start with the - letter "c" meaning character device and should contain "227, 1" - just to the left of the device name. No such file? no "c"? - Wrong major number? Wrong minor number? There's your - problem! - - D. Do you get the message:: - - "HCPDIA047E my-vm-guest-name 0620 does not exist"? - - If so, you must issue the command "DEF GRAF 620" from your VM - 3215 console and then reboot the system. - - - -OPERATION. -========== - -The driver defines three areas on the 3270 screen: the log area, the -input area, and the status area. - -The log area takes up all but the bottom two lines of the screen. The -driver writes terminal output to it, starting at the top line and going -down. When it fills, the status area changes from "Linux Running" to -"Linux More...". 
After a scrolling timeout of (default) 5 sec, the -screen clears and more output is written, from the top down. - -The input area extends from the beginning of the second-to-last screen -line to the start of the status area. You type commands in this area -and hit ENTER to execute them. - -The status area initializes to "Linux Running" to give you a warm -fuzzy feeling. When the log area fills up and output awaits, it -changes to "Linux More...". At this time you can do several things or -nothing. If you do nothing, the screen will clear in (default) 5 sec -and more output will appear. You may hit ENTER with nothing typed in -the input area to toggle between "Linux More..." and "Linux Holding", -which indicates no scrolling will occur. (If you hit ENTER with "Linux -Running" and nothing typed, the application receives a newline.) - -You may change the scrolling timeout value. For example, the following -command line:: - - echo scrolltime=60 > /proc/tty/driver/tty3270 - -changes the scrolling timeout value to 60 sec. Set scrolltime to 0 if -you wish to prevent scrolling entirely. - -Other things you may do when the log area fills up are: hit PA2 to -clear the log area and write more output to it, or hit CLEAR to clear -the log area and the input area and write more output to the log area. - -Some of the Program Function (PF) and Program Attention (PA) keys are -preassigned special functions. The ones that are not yield an alarm -when pressed. - -PA1 causes a SIGINT to the currently running application. You may do -the same thing from the input area, by typing "^C" and hitting ENTER. - -PA2 causes the log area to be cleared. If output awaits, it is then -written to the log area. - -PF3 causes an EOF to be received as input by the application. You may -cause an EOF also by typing "^D" and hitting ENTER. - -No PF key is preassigned to cause a job suspension, but you may cause a -job suspension by typing "^Z" and hitting ENTER. 
You may wish to -assign this function to a PF key. To make PF7 cause job suspension, -execute the command:: - - echo pf7=^z > /proc/tty/driver/tty3270 - -If the input you type does not end with the two characters "^n", the -driver appends a newline character and sends it to the tty driver; -otherwise the driver strips the "^n" and does not append a newline. -The IBM 3215 driver behaves similarly. - -Pf10 causes the most recent command to be retrieved from the tube's -command stack (default depth 20) and displayed in the input area. You -may hit PF10 again for the next-most-recent command, and so on. A -command is entered into the stack only when the input area is not made -invisible (such as for password entry) and it is not identical to the -current top entry. PF10 rotates backward through the command stack; -PF11 rotates forward. You may assign the backward function to any PF -key (or PA key, for that matter), say, PA3, with the command:: - - echo -e pa3=\\033k > /proc/tty/driver/tty3270 - -This assigns the string ESC-k to PA3. Similarly, the string ESC-j -performs the forward function. (Rationale: In bash with vi-mode line -editing, ESC-k and ESC-j retrieve backward and forward history. -Suggestions welcome.) - -Is a stack size of twenty commands not to your liking? Change it on -the fly. To change to saving the last 100 commands, execute the -command:: - - echo recallsize=100 > /proc/tty/driver/tty3270 - -Have a command you issue frequently? Assign it to a PF or PA key! Use -the command:: - - echo pf24="mkdir foobar; cd foobar" > /proc/tty/driver/tty3270 - -to execute the commands mkdir foobar and cd foobar immediately when you -hit PF24. Want to see the command line first, before you execute it? -Use the -n option of the echo command:: - - echo -n pf24="mkdir foo; cd foo" > /proc/tty/driver/tty3270 - - - -Happy testing! I welcome any and all comments about this document, the -driver, etc etc. 
- -Dick Hitt diff --git a/Documentation/s390/cds.rst b/Documentation/s390/cds.rst deleted file mode 100644 index 7006d8209d2e..000000000000 --- a/Documentation/s390/cds.rst +++ /dev/null @@ -1,530 +0,0 @@ -=========================== -Linux for S/390 and zSeries -=========================== - -Common Device Support (CDS) -Device Driver I/O Support Routines - -Authors: - - Ingo Adlung - - Cornelia Huck - -Copyright, IBM Corp. 1999-2002 - -Introduction -============ - -This document describes the common device support routines for Linux/390. -Different than other hardware architectures, ESA/390 has defined a unified -I/O access method. This gives relief to the device drivers as they don't -have to deal with different bus types, polling versus interrupt -processing, shared versus non-shared interrupt processing, DMA versus port -I/O (PIO), and other hardware features more. However, this implies that -either every single device driver needs to implement the hardware I/O -attachment functionality itself, or the operating system provides for a -unified method to access the hardware, providing all the functionality that -every single device driver would have to provide itself. - -The document does not intend to explain the ESA/390 hardware architecture in -every detail.This information can be obtained from the ESA/390 Principles of -Operation manual (IBM Form. No. SA22-7201). - -In order to build common device support for ESA/390 I/O interfaces, a -functional layer was introduced that provides generic I/O access methods to -the hardware. - -The common device support layer comprises the I/O support routines defined -below. Some of them implement common Linux device driver interfaces, while -some of them are ESA/390 platform specific. - -Note: - In order to write a driver for S/390, you also need to look into the interface - described in Documentation/s390/driver-model.rst. 
- -Note for porting drivers from 2.4: - -The major changes are: - -* The functions use a ccw_device instead of an irq (subchannel). -* All drivers must define a ccw_driver (see driver-model.txt) and the associated - functions. -* request_irq() and free_irq() are no longer done by the driver. -* The oper_handler is (kindof) replaced by the probe() and set_online() functions - of the ccw_driver. -* The not_oper_handler is (kindof) replaced by the remove() and set_offline() - functions of the ccw_driver. -* The channel device layer is gone. -* The interrupt handlers must be adapted to use a ccw_device as argument. - Moreover, they don't return a devstat, but an irb. -* Before initiating an io, the options must be set via ccw_device_set_options(). -* Instead of calling read_dev_chars()/read_conf_data(), the driver issues - the channel program and handles the interrupt itself. - -ccw_device_get_ciw() - get commands from extended sense data. - -ccw_device_start(), ccw_device_start_timeout(), ccw_device_start_key(), ccw_device_start_key_timeout() - initiate an I/O request. - -ccw_device_resume() - resume channel program execution. - -ccw_device_halt() - terminate the current I/O request processed on the device. - -do_IRQ() - generic interrupt routine. This function is called by the interrupt entry - routine whenever an I/O interrupt is presented to the system. The do_IRQ() - routine determines the interrupt status and calls the device specific - interrupt handler according to the rules (flags) defined during I/O request - initiation with do_IO(). - -The next chapters describe the functions other than do_IRQ() in more details. -The do_IRQ() interface is not described, as it is called from the Linux/390 -first level interrupt handler only and does not comprise a device driver -callable interface. Instead, the functional description of do_IO() also -describes the input to the device specific interrupt handler. 
- -Note: - All explanations apply also to the 64 bit architecture s390x. - - -Common Device Support (CDS) for Linux/390 Device Drivers -======================================================== - -General Information -------------------- - -The following chapters describe the I/O related interface routines the -Linux/390 common device support (CDS) provides to allow for device specific -driver implementations on the IBM ESA/390 hardware platform. Those interfaces -intend to provide the functionality required by every device driver -implementation to allow to drive a specific hardware device on the ESA/390 -platform. Some of the interface routines are specific to Linux/390 and some -of them can be found on other Linux platforms implementations too. -Miscellaneous function prototypes, data declarations, and macro definitions -can be found in the architecture specific C header file -linux/arch/s390/include/asm/irq.h. - -Overview of CDS interface concepts ----------------------------------- - -Different to other hardware platforms, the ESA/390 architecture doesn't define -interrupt lines managed by a specific interrupt controller and bus systems -that may or may not allow for shared interrupts, DMA processing, etc.. Instead, -the ESA/390 architecture has implemented a so called channel subsystem, that -provides a unified view of the devices physically attached to the systems. -Though the ESA/390 hardware platform knows about a huge variety of different -peripheral attachments like disk devices (aka. DASDs), tapes, communication -controllers, etc. they can all be accessed by a well defined access method and -they are presenting I/O completion a unified way : I/O interruptions. Every -single device is uniquely identified to the system by a so called subchannel, -where the ESA/390 architecture allows for 64k devices be attached. 
- -Linux, however, was first built on the Intel PC architecture, with its two -cascaded 8259 programmable interrupt controllers (PICs), that allow for a -maximum of 15 different interrupt lines. All devices attached to such a system -share those 15 interrupt levels. Devices attached to the ISA bus system must -not share interrupt levels (aka. IRQs), as the ISA bus bases on edge triggered -interrupts. MCA, EISA, PCI and other bus systems base on level triggered -interrupts, and therewith allow for shared IRQs. However, if multiple devices -present their hardware status by the same (shared) IRQ, the operating system -has to call every single device driver registered on this IRQ in order to -determine the device driver owning the device that raised the interrupt. - -Up to kernel 2.4, Linux/390 used to provide interfaces via the IRQ (subchannel). -For internal use of the common I/O layer, these are still there. However, -device drivers should use the new calling interface via the ccw_device only. - -During its startup the Linux/390 system checks for peripheral devices. Each -of those devices is uniquely defined by a so called subchannel by the ESA/390 -channel subsystem. While the subchannel numbers are system generated, each -subchannel also takes a user defined attribute, the so called device number. -Both subchannel number and device number cannot exceed 65535. During sysfs -initialisation, the information about control unit type and device types that -imply specific I/O commands (channel command words - CCWs) in order to operate -the device are gathered. Device drivers can retrieve this set of hardware -information during their initialization step to recognize the devices they -support using the information saved in the struct ccw_device given to them. -This methods implies that Linux/390 doesn't require to probe for free (not -armed) interrupt request lines (IRQs) to drive its devices with. 
Where -applicable, the device drivers can use issue the READ DEVICE CHARACTERISTICS -ccw to retrieve device characteristics in its online routine. - -In order to allow for easy I/O initiation the CDS layer provides a -ccw_device_start() interface that takes a device specific channel program (one -or more CCWs) as input sets up the required architecture specific control blocks -and initiates an I/O request on behalf of the device driver. The -ccw_device_start() routine allows to specify whether it expects the CDS layer -to notify the device driver for every interrupt it observes, or with final status -only. See ccw_device_start() for more details. A device driver must never issue -ESA/390 I/O commands itself, but must use the Linux/390 CDS interfaces instead. - -For long running I/O request to be canceled, the CDS layer provides the -ccw_device_halt() function. Some devices require to initially issue a HALT -SUBCHANNEL (HSCH) command without having pending I/O requests. This function is -also covered by ccw_device_halt(). - - -get_ciw() - get command information word - -This call enables a device driver to get information about supported commands -from the extended SenseID data. - -:: - - struct ciw * - ccw_device_get_ciw(struct ccw_device *cdev, __u32 cmd); - -==== ======================================================== -cdev The ccw_device for which the command is to be retrieved. -cmd The command type to be retrieved. -==== ======================================================== - -ccw_device_get_ciw() returns: - -===== ================================================================ - NULL No extended data available, invalid device or command not found. -!NULL The command requested. -===== ================================================================ - -:: - - ccw_device_start() - Initiate I/O Request - -The ccw_device_start() routines is the I/O request front-end processor. All -device driver I/O requests must be issued using this routine. 
A device driver -must not issue ESA/390 I/O commands itself. Instead the ccw_device_start() -routine provides all interfaces required to drive arbitrary devices. - -This description also covers the status information passed to the device -driver's interrupt handler as this is related to the rules (flags) defined -with the associated I/O request when calling ccw_device_start(). - -:: - - int ccw_device_start(struct ccw_device *cdev, - struct ccw1 *cpa, - unsigned long intparm, - __u8 lpm, - unsigned long flags); - int ccw_device_start_timeout(struct ccw_device *cdev, - struct ccw1 *cpa, - unsigned long intparm, - __u8 lpm, - unsigned long flags, - int expires); - int ccw_device_start_key(struct ccw_device *cdev, - struct ccw1 *cpa, - unsigned long intparm, - __u8 lpm, - __u8 key, - unsigned long flags); - int ccw_device_start_key_timeout(struct ccw_device *cdev, - struct ccw1 *cpa, - unsigned long intparm, - __u8 lpm, - __u8 key, - unsigned long flags, - int expires); - -============= ============================================================= -cdev ccw_device the I/O is destined for -cpa logical start address of channel program -user_intparm user specific interrupt information; will be presented - back to the device driver's interrupt handler. Allows a - device driver to associate the interrupt with a - particular I/O request. -lpm defines the channel path to be used for a specific I/O - request. A value of 0 will make cio use the opm. -key the storage key to use for the I/O (useful for operating on a - storage with a storage key != default key) -flag defines the action to be performed for I/O processing -expires timeout value in jiffies. The common I/O layer will terminate - the running program after this and call the interrupt handler - with ERR_PTR(-ETIMEDOUT) as irb. 
-============= ============================================================= - -Possible flag values are: - -========================= ============================================= -DOIO_ALLOW_SUSPEND channel program may become suspended -DOIO_DENY_PREFETCH don't allow for CCW prefetch; usually - this implies the channel program might - become modified -DOIO_SUPPRESS_INTER don't call the handler on intermediate status -========================= ============================================= - -The cpa parameter points to the first format 1 CCW of a channel program:: - - struct ccw1 { - __u8 cmd_code;/* command code */ - __u8 flags; /* flags, like IDA addressing, etc. */ - __u16 count; /* byte count */ - __u32 cda; /* data address */ - } __attribute__ ((packed,aligned(8))); - -with the following CCW flags values defined: - -=================== ========================= -CCW_FLAG_DC data chaining -CCW_FLAG_CC command chaining -CCW_FLAG_SLI suppress incorrect length -CCW_FLAG_SKIP skip -CCW_FLAG_PCI PCI -CCW_FLAG_IDA indirect addressing -CCW_FLAG_SUSPEND suspend -=================== ========================= - - -Via ccw_device_set_options(), the device driver may specify the following -options for the device: - -========================= ====================================== -DOIO_EARLY_NOTIFICATION allow for early interrupt notification -DOIO_REPORT_ALL report all interrupt conditions -========================= ====================================== - - -The ccw_device_start() function returns: - -======== ====================================================================== - 0 successful completion or request successfully initiated - -EBUSY The device is currently processing a previous I/O request, or there is - a status pending at the device. --ENODEV cdev is invalid, the device is not operational or the ccw_device is - not online. 
-======== ====================================================================== - -When the I/O request completes, the CDS first level interrupt handler will -accumulate the status in a struct irb and then call the device interrupt handler. -The intparm field will contain the value the device driver has associated with a -particular I/O request. If a pending device status was recognized, -intparm will be set to 0 (zero). This may happen during I/O initiation or delayed -by an alert status notification. In any case this status is not related to the -current (last) I/O request. In case of a delayed status notification no special -interrupt will be presented to indicate I/O completion as the I/O request was -never started, even though ccw_device_start() returned with successful completion. - -The irb may contain an error value, and the device driver should check for this -first: - -========== ================================================================= --ETIMEDOUT the common I/O layer terminated the request after the specified - timeout value --EIO the common I/O layer terminated the request due to an error state -========== ================================================================= - -If the concurrent sense flag in the extended status word (esw) in the irb is -set, the field erw.scnt in the esw describes the number of device specific -sense bytes available in the extended control word irb->scsw.ecw[]. No device -sensing by the device driver itself is required. - -The device interrupt handler can use the following definitions to investigate -the primary unit check source coded in sense byte 0 : - -======================= ==== -SNS0_CMD_REJECT 0x80 -SNS0_INTERVENTION_REQ 0x40 -SNS0_BUS_OUT_CHECK 0x20 -SNS0_EQUIPMENT_CHECK 0x10 -SNS0_DATA_CHECK 0x08 -SNS0_OVERRUN 0x04 -SNS0_INCOMPL_DOMAIN 0x01 -======================= ==== - -Depending on the device status, multiple of those values may be set together. 
-Please refer to the device specific documentation for details. - -The irb->scsw.cstat field provides the (accumulated) subchannel status : - -========================= ============================ -SCHN_STAT_PCI program controlled interrupt -SCHN_STAT_INCORR_LEN incorrect length -SCHN_STAT_PROG_CHECK program check -SCHN_STAT_PROT_CHECK protection check -SCHN_STAT_CHN_DATA_CHK channel data check -SCHN_STAT_CHN_CTRL_CHK channel control check -SCHN_STAT_INTF_CTRL_CHK interface control check -SCHN_STAT_CHAIN_CHECK chaining check -========================= ============================ - -The irb->scsw.dstat field provides the (accumulated) device status : - -===================== ================= -DEV_STAT_ATTENTION attention -DEV_STAT_STAT_MOD status modifier -DEV_STAT_CU_END control unit end -DEV_STAT_BUSY busy -DEV_STAT_CHN_END channel end -DEV_STAT_DEV_END device end -DEV_STAT_UNIT_CHECK unit check -DEV_STAT_UNIT_EXCEP unit exception -===================== ================= - -Please see the ESA/390 Principles of Operation manual for details on the -individual flag meanings. - -Usage Notes: - -ccw_device_start() must be called disabled and with the ccw device lock held. - -The device driver is allowed to issue the next ccw_device_start() call from -within its interrupt handler already. It is not required to schedule a -bottom-half, unless a non deterministically long running error recovery procedure -or similar needs to be scheduled. During I/O processing the Linux/390 generic -I/O device driver support has already obtained the IRQ lock, i.e. the handler -must not try to obtain it again when calling ccw_device_start() or we end in a -deadlock situation! - -If a device driver relies on an I/O request to be completed prior to start the -next it can reduce I/O processing overhead by chaining a NoOp I/O command -CCW_CMD_NOOP to the end of the submitted CCW chain. This will force Channel-End -and Device-End status to be presented together, with a single interrupt. 
-However, this should be used with care as it implies the channel will remain -busy, not being able to process I/O requests for other devices on the same -channel. Therefore e.g. read commands should never use this technique, as the -result will be presented by a single interrupt anyway. - -In order to minimize I/O overhead, a device driver should use the -DOIO_REPORT_ALL only if the device can report intermediate interrupt -information prior to device-end the device driver urgently relies on. In this -case all I/O interruptions are presented to the device driver until final -status is recognized. - -If a device is able to recover from asynchronously presented I/O errors, it can -perform overlapping I/O using the DOIO_EARLY_NOTIFICATION flag. While some -devices always report channel-end and device-end together, with a single -interrupt, others present primary status (channel-end) when the channel is -ready for the next I/O request and secondary status (device-end) when the data -transmission has been completed at the device. - -Above flag allows to exploit this feature, e.g. for communication devices that -can handle lost data on the network to allow for enhanced I/O processing. - -Unless the channel subsystem at any time presents a secondary status interrupt, -exploiting this feature will cause only primary status interrupts to be -presented to the device driver while overlapping I/O is performed. When a -secondary status without error (alert status) is presented, this indicates -successful completion for all overlapping ccw_device_start() requests that have -been issued since the last secondary (final) status. - -Channel programs that intend to set the suspend flag on a channel command word -(CCW) must start the I/O operation with the DOIO_ALLOW_SUSPEND option or the -suspend flag will cause a channel program check. At the time the channel program -becomes suspended an intermediate interrupt will be generated by the channel -subsystem. 
- -ccw_device_resume() - Resume Channel Program Execution - -If a device driver chooses to suspend the current channel program execution by -setting the CCW suspend flag on a particular CCW, the channel program execution -is suspended. In order to resume channel program execution the CIO layer -provides the ccw_device_resume() routine. - -:: - - int ccw_device_resume(struct ccw_device *cdev); - -==== ================================================ -cdev ccw_device the resume operation is requested for -==== ================================================ - -The ccw_device_resume() function returns: - -========= ============================================== - 0 suspended channel program is resumed - -EBUSY status pending - -ENODEV cdev invalid or not-operational subchannel - -EINVAL resume function not applicable --ENOTCONN there is no I/O request pending for completion -========= ============================================== - -Usage Notes: - -Please have a look at the ccw_device_start() usage notes for more details on -suspended channel programs. - -ccw_device_halt() - Halt I/O Request Processing - -Sometimes a device driver might need a possibility to stop the processing of -a long-running channel program or the device might require to initially issue -a halt subchannel (HSCH) I/O command. For those purposes the ccw_device_halt() -command is provided. - -ccw_device_halt() must be called disabled and with the ccw device lock held. 
- -:: - - int ccw_device_halt(struct ccw_device *cdev, - unsigned long intparm); - -======= ===================================================== -cdev ccw_device the halt operation is requested for -intparm interruption parameter; value is only used if no I/O - is outstanding, otherwise the intparm associated with - the I/O request is returned -======= ===================================================== - -The ccw_device_halt() function returns: - -======= ============================================================== - 0 request successfully initiated --EBUSY the device is currently busy, or status pending. --ENODEV cdev invalid. --EINVAL The device is not operational or the ccw device is not online. -======= ============================================================== - -Usage Notes: - -A device driver may write a never-ending channel program by writing a channel -program that at its end loops back to its beginning by means of a transfer in -channel (TIC) command (CCW_CMD_TIC). Usually this is performed by network -device drivers by setting the PCI CCW flag (CCW_FLAG_PCI). Once this CCW is -executed a program controlled interrupt (PCI) is generated. The device driver -can then perform an appropriate action. Prior to interrupt of an outstanding -read to a network device (with or without PCI flag) a ccw_device_halt() -is required to end the pending operation. - -:: - - ccw_device_clear() - Terminate I/O Request Processing - -In order to terminate all I/O processing at the subchannel, the clear subchannel -(CSCH) command is used. It can be issued via ccw_device_clear(). - -ccw_device_clear() must be called disabled and with the ccw device lock held. 
- -:: - - int ccw_device_clear(struct ccw_device *cdev, unsigned long intparm); - -======= =============================================== -cdev ccw_device the clear operation is requested for -intparm interruption parameter (see ccw_device_halt()) -======= =============================================== - -The ccw_device_clear() function returns: - -======= ============================================================== - 0 request successfully initiated --ENODEV cdev invalid --EINVAL The device is not operational or the ccw device is not online. -======= ============================================================== - -Miscellaneous Support Routines ------------------------------- - -This chapter describes various routines to be used in a Linux/390 device -driver programming environment. - -get_ccwdev_lock() - -Get the address of the device specific lock. This is then used in -spin_lock() / spin_unlock() calls. - -:: - - __u8 ccw_device_get_path_mask(struct ccw_device *cdev); - -Get the mask of the path currently available for cdev. diff --git a/Documentation/s390/common_io.rst b/Documentation/s390/common_io.rst deleted file mode 100644 index 846485681ce7..000000000000 --- a/Documentation/s390/common_io.rst +++ /dev/null @@ -1,140 +0,0 @@ -====================== -S/390 common I/O-Layer -====================== - -command line parameters, procfs and debugfs entries -=================================================== - -Command line parameters ------------------------ - -* ccw_timeout_log - - Enable logging of debug information in case of ccw device timeouts. - -* cio_ignore = device[,device[,..]] - - device := {all | [!]ipldev | [!]condev | [!] | [!]-} - - The given devices will be ignored by the common I/O-layer; no detection - and device sensing will be done on any of those devices. The subchannel to - which the device in question is attached will be treated as if no device was - attached. 
- - An ignored device can be un-ignored later; see the "/proc entries"-section for - details. - - The devices must be given either as bus ids (0.x.abcd) or as hexadecimal - device numbers (0xabcd or abcd, for 2.4 backward compatibility). If you - give a device number 0xabcd, it will be interpreted as 0.0.abcd. - - You can use the 'all' keyword to ignore all devices. The 'ipldev' and 'condev' - keywords can be used to refer to the CCW based boot device and CCW console - device respectively (these are probably useful only when combined with the '!' - operator). The '!' operator will cause the I/O-layer to _not_ ignore a device. - The command line - is parsed from left to right. - - For example:: - - cio_ignore=0.0.0023-0.0.0042,0.0.4711 - - will ignore all devices ranging from 0.0.0023 to 0.0.0042 and the device - 0.0.4711, if detected. - - As another example:: - - cio_ignore=all,!0.0.4711,!0.0.fd00-0.0.fd02 - - will ignore all devices but 0.0.4711, 0.0.fd00, 0.0.fd01, 0.0.fd02. - - By default, no devices are ignored. - - -/proc entries -------------- - -* /proc/cio_ignore - - Lists the ranges of devices (by bus id) which are ignored by common I/O. - - You can un-ignore certain or all devices by piping to /proc/cio_ignore. - "free all" will un-ignore all ignored devices, - "free , , ..." will un-ignore the specified - devices. - - For example, if devices 0.0.0023 to 0.0.0042 and 0.0.4711 are ignored, - - - echo free 0.0.0030-0.0.0032 > /proc/cio_ignore - will un-ignore devices 0.0.0030 to 0.0.0032 and will leave devices 0.0.0023 - to 0.0.002f, 0.0.0033 to 0.0.0042 and 0.0.4711 ignored; - - echo free 0.0.0041 > /proc/cio_ignore will furthermore un-ignore device - 0.0.0041; - - echo free all > /proc/cio_ignore will un-ignore all remaining ignored - devices. - - When a device is un-ignored, device recognition and sensing is performed and - the device driver will be notified if possible, so the device will become - available to the system. 
Note that un-ignoring is performed asynchronously. - - You can also add ranges of devices to be ignored by piping to - /proc/cio_ignore; "add , , ..." will ignore the - specified devices. - - Note: While already known devices can be added to the list of devices to be - ignored, there will be no effect on them. However, if such a device - disappears and then reappears, it will then be ignored. To make - known devices go away, you need the "purge" command (see below). - - For example:: - - "echo add 0.0.a000-0.0.accc, 0.0.af00-0.0.afff > /proc/cio_ignore" - - will add 0.0.a000-0.0.accc and 0.0.af00-0.0.afff to the list of ignored - devices. - - You can remove already known but now ignored devices via:: - - "echo purge > /proc/cio_ignore" - - All devices ignored but still registered and not online (= not in use) - will be deregistered and thus removed from the system. - - The devices can be specified either by bus id (0.x.abcd) or, for 2.4 backward - compatibility, by the device number in hexadecimal (0xabcd or abcd). Device - numbers given as 0xabcd will be interpreted as 0.0.abcd. - -* /proc/cio_settle - - A write request to this file is blocked until all queued cio actions are - handled. This will allow userspace to wait for pending work affecting - device availability after changing cio_ignore or the hardware configuration. - -* For some of the information present in the /proc filesystem in 2.4 (namely, - /proc/subchannels and /proc/chpids), see driver-model.txt. - Information formerly in /proc/irq_count is now in /proc/interrupts. - - -debugfs entries ---------------- - -* /sys/kernel/debug/s390dbf/cio_*/ (S/390 debug feature) - - Some views generated by the debug feature to hold various debug outputs. - - - /sys/kernel/debug/s390dbf/cio_crw/sprintf - Messages from the processing of pending channel report words (machine check - handling). - - - /sys/kernel/debug/s390dbf/cio_msg/sprintf - Various debug messages from the common I/O-layer. 
- - - /sys/kernel/debug/s390dbf/cio_trace/hex_ascii - Logs the calling of functions in the common I/O-layer and, if applicable, - which subchannel they were called for, as well as dumps of some data - structures (like irb in an error case). - - The level of logging can be changed to be more or less verbose by piping to - /sys/kernel/debug/s390dbf/cio_*/level a number between 0 and 6; see the - documentation on the S/390 debug feature (Documentation/s390/s390dbf.rst) - for details. diff --git a/Documentation/s390/config3270.sh b/Documentation/s390/config3270.sh deleted file mode 100644 index 515e2f431487..000000000000 --- a/Documentation/s390/config3270.sh +++ /dev/null @@ -1,76 +0,0 @@ -#!/bin/sh -# -# config3270 -- Autoconfigure /dev/3270/* and /etc/inittab -# -# Usage: -# config3270 -# -# Output: -# /tmp/mkdev3270 -# -# Operation: -# 1. Run this script -# 2. Run the script it produces: /tmp/mkdev3270 -# 3. Issue "telinit q" or reboot, as appropriate. -# -P=/proc/tty/driver/tty3270 -ROOT= -D=$ROOT/dev -SUBD=3270 -TTY=$SUBD/tty -TUB=$SUBD/tub -SCR=$ROOT/tmp/mkdev3270 -SCRTMP=$SCR.a -GETTYLINE=:2345:respawn:/sbin/mingetty -INITTAB=$ROOT/etc/inittab -NINITTAB=$ROOT/etc/NEWinittab -OINITTAB=$ROOT/etc/OLDinittab -ADDNOTE=\\"# Additional mingettys for the 3270/tty* driver, tub3270 ---\\" - -if ! ls $P > /dev/null 2>&1; then - modprobe tub3270 > /dev/null 2>&1 -fi -ls $P > /dev/null 2>&1 || exit 1 - -# Initialize two files, one for /dev/3270 commands and one -# to replace the /etc/inittab file (old one saved in OLDinittab) -echo "#!/bin/sh" > $SCR || exit 1 -echo " " >> $SCR -echo "# Script built by /sbin/config3270" >> $SCR -if [ ! -d /dev/dasd ]; then - echo rm -rf "$D/$SUBD/*" >> $SCR -fi -echo "grep -v $TTY $INITTAB > $NINITTAB" > $SCRTMP || exit 1 -echo "echo $ADDNOTE >> $NINITTAB" >> $SCRTMP -if [ ! 
-d /dev/dasd ]; then - echo mkdir -p $D/$SUBD >> $SCR -fi - -# Now query the tub3270 driver for 3270 device information -# and add appropriate mknod and mingetty lines to our files -echo what=config > $P -while read devno maj min;do - if [ $min = 0 ]; then - fsmaj=$maj - if [ ! -d /dev/dasd ]; then - echo mknod $D/$TUB c $fsmaj 0 >> $SCR - echo chmod 666 $D/$TUB >> $SCR - fi - elif [ $maj = CONSOLE ]; then - if [ ! -d /dev/dasd ]; then - echo mknod $D/$TUB$devno c $fsmaj $min >> $SCR - fi - else - if [ ! -d /dev/dasd ]; then - echo mknod $D/$TTY$devno c $maj $min >>$SCR - echo mknod $D/$TUB$devno c $fsmaj $min >> $SCR - fi - echo "echo t$min$GETTYLINE $TTY$devno >> $NINITTAB" >> $SCRTMP - fi -done < $P - -echo mv $INITTAB $OINITTAB >> $SCRTMP || exit 1 -echo mv $NINITTAB $INITTAB >> $SCRTMP -cat $SCRTMP >> $SCR -rm $SCRTMP -exit 0 diff --git a/Documentation/s390/driver-model.rst b/Documentation/s390/driver-model.rst deleted file mode 100644 index ad4bc2dbea43..000000000000 --- a/Documentation/s390/driver-model.rst +++ /dev/null @@ -1,328 +0,0 @@ -============================= -S/390 driver model interfaces -============================= - -1. CCW devices --------------- - -All devices which can be addressed by means of ccws are called 'CCW devices' - -even if they aren't actually driven by ccws. - -All ccw devices are accessed via a subchannel, this is reflected in the -structures under devices/:: - - devices/ - - system/ - - css0/ - - 0.0.0000/0.0.0815/ - - 0.0.0001/0.0.4711/ - - 0.0.0002/ - - 0.1.0000/0.1.1234/ - ... - - defunct/ - -In this example, device 0815 is accessed via subchannel 0 in subchannel set 0, -device 4711 via subchannel 1 in subchannel set 0, and subchannel 2 is a non-I/O -subchannel. Device 1234 is accessed via subchannel 0 in subchannel set 1. 
- -The subchannel named 'defunct' does not represent any real subchannel on the -system; it is a pseudo subchannel where disconnected ccw devices are moved to -if they are displaced by another ccw device becoming operational on their -former subchannel. The ccw devices will be moved again to a proper subchannel -if they become operational again on that subchannel. - -You should address a ccw device via its bus id (e.g. 0.0.4711); the device can -be found under bus/ccw/devices/. - -All ccw devices export some data via sysfs. - -cutype: - The control unit type / model. - -devtype: - The device type / model, if applicable. - -availability: - Can be 'good' or 'boxed'; 'no path' or 'no device' for - disconnected devices. - -online: - An interface to set the device online and offline. - In the special case of the device being disconnected (see the - notify function under 1.2), piping 0 to online will forcibly delete - the device. - -The device drivers can add entries to export per-device data and interfaces. - -There is also some data exported on a per-subchannel basis (see under -bus/css/devices/): - -chpids: - Via which chpids the device is connected. - -pimpampom: - The path installed, path available and path operational masks. - -There also might be additional data, for example for block devices. - - -1.1 Bringing up a ccw device ----------------------------- - -This is done in several steps. - -a. Each driver can provide one or more parameter interfaces where parameters can - be specified. These interfaces are also in the driver's responsibility. -b. After a. has been performed, if necessary, the device is finally brought up - via the 'online' interface. 
- - -1.2 Writing a driver for ccw devices ------------------------------------- - -The basic struct ccw_device and struct ccw_driver data structures can be found -under include/asm/ccwdev.h:: - - struct ccw_device { - spinlock_t *ccwlock; - struct ccw_device_private *private; - struct ccw_device_id id; - - struct ccw_driver *drv; - struct device dev; - int online; - - void (*handler) (struct ccw_device *dev, unsigned long intparm, - struct irb *irb); - }; - - struct ccw_driver { - struct module *owner; - struct ccw_device_id *ids; - int (*probe) (struct ccw_device *); - int (*remove) (struct ccw_device *); - int (*set_online) (struct ccw_device *); - int (*set_offline) (struct ccw_device *); - int (*notify) (struct ccw_device *, int); - struct device_driver driver; - char *name; - }; - -The 'private' field contains data needed for internal i/o operation only, and -is not available to the device driver. - -Each driver should declare in a MODULE_DEVICE_TABLE into which CU types/models -and/or device types/models it is interested. This information can later be found -in the struct ccw_device_id fields:: - - struct ccw_device_id { - __u16 match_flags; - - __u16 cu_type; - __u16 dev_type; - __u8 cu_model; - __u8 dev_model; - - unsigned long driver_info; - }; - -The functions in ccw_driver should be used in the following way: - -probe: - This function is called by the device layer for each device the driver - is interested in. The driver should only allocate private structures - to put in dev->driver_data and create attributes (if needed). Also, - the interrupt handler (see below) should be set here. - -:: - - int (*probe) (struct ccw_device *cdev); - -Parameters: - cdev - - the device to be probed. - - -remove: - This function is called by the device layer upon removal of the driver, - the device or the module. The driver should perform cleanups here. - -:: - - int (*remove) (struct ccw_device *cdev); - -Parameters: - cdev - - the device to be removed. 
- - -set_online: - This function is called by the common I/O layer when the device is - activated via the 'online' attribute. The driver should finally - setup and activate the device here. - -:: - - int (*set_online) (struct ccw_device *); - -Parameters: - cdev - - the device to be activated. The common layer has - verified that the device is not already online. - - -set_offline: This function is called by the common I/O layer when the device is - de-activated via the 'online' attribute. The driver should shut - down the device, but not de-allocate its private data. - -:: - - int (*set_offline) (struct ccw_device *); - -Parameters: - cdev - - the device to be deactivated. The common layer has - verified that the device is online. - - -notify: - This function is called by the common I/O layer for some state changes - of the device. - - Signalled to the driver are: - - * In online state, device detached (CIO_GONE) or last path gone - (CIO_NO_PATH). The driver must return !0 to keep the device; for - return code 0, the device will be deleted as usual (also when no - notify function is registered). If the driver wants to keep the - device, it is moved into disconnected state. - * In disconnected state, device operational again (CIO_OPER). The - common I/O layer performs some sanity checks on device number and - Device / CU to be reasonably sure if it is still the same device. - If not, the old device is removed and a new one registered. By the - return code of the notify function the device driver signals if it - wants the device back: !0 for keeping, 0 to make the device being - removed and re-registered. - -:: - - int (*notify) (struct ccw_device *, int); - -Parameters: - cdev - - the device whose state changed. - - event - - the event that happened. This can be one of CIO_GONE, - CIO_NO_PATH or CIO_OPER. - -The handler field of the struct ccw_device is meant to be set to the interrupt -handler for the device. 
In order to accommodate drivers which use several -distinct handlers (e.g. multi subchannel devices), this is a member of ccw_device -instead of ccw_driver. -The handler is registered with the common layer during set_online() processing -before the driver is called, and is deregistered during set_offline() after the -driver has been called. Also, after registering / before deregistering, path -grouping resp. disbanding of the path group (if applicable) are performed. - -:: - - void (*handler) (struct ccw_device *dev, unsigned long intparm, struct irb *irb); - -Parameters: dev - the device the handler is called for - intparm - the intparm which allows the device driver to identify - the i/o the interrupt is associated with, or to recognize - the interrupt as unsolicited. - irb - interruption response block which contains the accumulated - status. - -The device driver is called from the common ccw_device layer and can retrieve -information about the interrupt from the irb parameter. - - -1.3 ccwgroup devices --------------------- - -The ccwgroup mechanism is designed to handle devices consisting of multiple ccw -devices, like lcs or ctc. - -The ccw driver provides a 'group' attribute. Piping bus ids of ccw devices to -this attribute creates a ccwgroup device consisting of these ccw devices (if -possible). This ccwgroup device can be set online or offline just like a normal -ccw device. - -Each ccwgroup device also provides an 'ungroup' attribute to destroy the device -again (only when offline). This is a generic ccwgroup mechanism (the driver does -not need to implement anything beyond normal removal routines). - -A ccw device which is a member of a ccwgroup device carries a pointer to the -ccwgroup device in the driver_data of its device struct. This field must not be -touched by the driver - it should use the ccwgroup device's driver_data for its -private data. - -To implement a ccwgroup driver, please refer to include/asm/ccwgroup.h. 
Keep in -mind that most drivers will need to implement both a ccwgroup and a ccw -driver. - - -2. Channel paths ------------------ - -Channel paths show up, like subchannels, under the channel subsystem root (css0) -and are called 'chp0.'. They have no driver and do not belong to any bus. -Please note, that unlike /proc/chpids in 2.4, the channel path objects reflect -only the logical state and not the physical state, since we cannot track the -latter consistently due to lacking machine support (we don't need to be aware -of it anyway). - -status - - Can be 'online' or 'offline'. - Piping 'on' or 'off' sets the chpid logically online/offline. - Piping 'on' to an online chpid triggers path reprobing for all devices - the chpid connects to. This can be used to force the kernel to re-use - a channel path the user knows to be online, but the machine hasn't - created a machine check for. - -type - - The physical type of the channel path. - -shared - - Whether the channel path is shared. - -cmg - - The channel measurement group. - -3. System devices ------------------ - -3.1 xpram ---------- - -xpram shows up under devices/system/ as 'xpram'. - -3.2 cpus --------- - -For each cpu, a directory is created under devices/system/cpu/. Each cpu has an -attribute 'online' which can be 0 or 1. - - -4. Other devices ----------------- - -4.1 Netiucv ------------ - -The netiucv driver creates an attribute 'connection' under -bus/iucv/drivers/netiucv. Piping to this attribute creates a new netiucv -connection to the specified host. - -Netiucv connections show up under devices/iucv/ as "netiucv". The interface -number is assigned sequentially to the connections defined via the 'connection' -attribute. - -user - - shows the connection partner. - -buffer - - maximum buffer size. Pipe to it to change buffer size. 
diff --git a/Documentation/s390/features.rst b/Documentation/s390/features.rst deleted file mode 100644 index 57c296a9d8f3..000000000000 --- a/Documentation/s390/features.rst +++ /dev/null @@ -1,3 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -.. kernel-feat:: $srctree/Documentation/features s390 diff --git a/Documentation/s390/index.rst b/Documentation/s390/index.rst deleted file mode 100644 index 73c79bf586fd..000000000000 --- a/Documentation/s390/index.rst +++ /dev/null @@ -1,30 +0,0 @@ -================= -s390 Architecture -================= - -.. toctree:: - :maxdepth: 1 - - cds - 3270 - driver-model - monreader - qeth - s390dbf - vfio-ap - vfio-ap-locking - vfio-ccw - zfcpdump - common_io - pci - - text_files - - features - -.. only:: subproject and html - - Indices - ======= - - * :ref:`genindex` diff --git a/Documentation/s390/monreader.rst b/Documentation/s390/monreader.rst deleted file mode 100644 index 21cdfb699b49..000000000000 --- a/Documentation/s390/monreader.rst +++ /dev/null @@ -1,212 +0,0 @@ -================================================= -Linux API for read access to z/VM Monitor Records -================================================= - -Date : 2004-Nov-26 - -Author: Gerald Schaefer (geraldsc@de.ibm.com) - - - - -Description -=========== -This item delivers a new Linux API in the form of a misc char device that is -usable from user space and allows read access to the z/VM Monitor Records -collected by the `*MONITOR` System Service of z/VM. - - -User Requirements -================= -The z/VM guest on which you want to access this API needs to be configured in -order to allow IUCV connections to the `*MONITOR` service, i.e. it needs the -IUCV `*MONITOR` statement in its user entry. If the monitor DCSS to be used is -restricted (likely), you also need the NAMESAVE statement. -This item will use the IUCV device driver to access the z/VM services, so you -need a kernel with IUCV support. You also need z/VM version 4.4 or 5.1. 
- -There are two options for being able to load the monitor DCSS (examples assume -that the monitor DCSS begins at 144 MB and ends at 152 MB). You can query the -location of the monitor DCSS with the Class E privileged CP command Q NSS MAP -(the values BEGPAG and ENDPAG are given in units of 4K pages). - -See also "CP Command and Utility Reference" (SC24-6081-00) for more information -on the DEF STOR and Q NSS MAP commands, as well as "Saved Segments Planning -and Administration" (SC24-6116-00) for more information on DCSSes. - -1st option: ------------ -You can use the CP command DEF STOR CONFIG to define a "memory hole" in your -guest virtual storage around the address range of the DCSS. - -Example: DEF STOR CONFIG 0.140M 200M.200M - -This defines two blocks of storage, the first is 140MB in size and begins at -address 0MB, the second is 200MB in size and begins at address 200MB, -resulting in a total storage of 340MB. Note that the first block should -always start at 0 and be at least 64MB in size. - -2nd option: ------------ -Your guest virtual storage has to end below the starting address of the DCSS -and you have to specify the "mem=" kernel parameter in your parmfile with a -value greater than the ending address of the DCSS. - -Example:: - - DEF STOR 140M - -This defines 140MB storage size for your guest, the parameter "mem=160M" is -added to the parmfile. - - -User Interface -============== -The char device is implemented as a kernel module named "monreader", -which can be loaded via the modprobe command, or it can be compiled into the -kernel instead. There is one optional module (or kernel) parameter, "mondcss", -to specify the name of the monitor DCSS. If the module is compiled into the -kernel, the kernel parameter "monreader.mondcss=" can be specified -in the parmfile. - -The default name for the DCSS is "MONDCSS" if none is specified. In case that -there are other users already connected to the `*MONITOR` service (e.g. 
-Performance Toolkit), the monitor DCSS is already defined and you have to use -the same DCSS. The CP command Q MONITOR (Class E privileged) shows the name -of the monitor DCSS, if already defined, and the users connected to the -`*MONITOR` service. -Refer to the "z/VM Performance" book (SC24-6109-00) on how to create a monitor -DCSS if your z/VM doesn't have one already, you need Class E privileges to -define and save a DCSS. - -Example: --------- - -:: - - modprobe monreader mondcss=MYDCSS - -This loads the module and sets the DCSS name to "MYDCSS". - -NOTE: ------ -This API provides no interface to control the `*MONITOR` service, e.g. specify -which data should be collected. This can be done by the CP command MONITOR -(Class E privileged), see "CP Command and Utility Reference". - -Device nodes with udev: ------------------------ -After loading the module, a char device will be created along with the device -node //monreader. - -Device nodes without udev: --------------------------- -If your distribution does not support udev, a device node will not be created -automatically and you have to create it manually after loading the module. -Therefore you need to know the major and minor numbers of the device. These -numbers can be found in /sys/class/misc/monreader/dev. - -Typing cat /sys/class/misc/monreader/dev will give an output of the form -:. The device node can be created via the mknod command, enter -mknod c , where is the name of the device node -to be created. - -Example: --------- - -:: - - # modprobe monreader - # cat /sys/class/misc/monreader/dev - 10:63 - # mknod /dev/monreader c 10 63 - -This loads the module with the default monitor DCSS (MONDCSS) and creates a -device node. - -File operations: ----------------- -The following file operations are supported: open, release, read, poll. -There are two alternative methods for reading: either non-blocking read in -conjunction with polling, or blocking read without polling. IOCTLs are not -supported. 
- -Read: ------ -Reading from the device provides a 12 Byte monitor control element (MCE), -followed by a set of one or more contiguous monitor records (similar to the -output of the CMS utility MONWRITE without the 4K control blocks). The MCE -contains information on the type of the following record set (sample/event -data), the monitor domains contained within it and the start and end address -of the record set in the monitor DCSS. The start and end address can be used -to determine the size of the record set, the end address is the address of the -last byte of data. The start address is needed to handle "end-of-frame" records -correctly (domain 1, record 13), i.e. it can be used to determine the record -start offset relative to a 4K page (frame) boundary. - -See "Appendix A: `*MONITOR`" in the "z/VM Performance" document for a description -of the monitor control element layout. The layout of the monitor records can -be found here (z/VM 5.1): https://www.vm.ibm.com/pubs/mon510/index.html - -The layout of the data stream provided by the monreader device is as follows:: - - ... - <0 byte read> - \ - | - ... |- data set - | - / - <0 byte read> - ... - -There may be more than one combination of MCE and corresponding record set -within one data set and the end of each data set is indicated by a successful -read with a return value of 0 (0 byte read). -Any received data must be considered invalid until a complete set was -read successfully, including the closing 0 byte read. Therefore you should -always read the complete set into a buffer before processing the data. - -The maximum size of a data set can be as large as the size of the -monitor DCSS, so design the buffer adequately or use dynamic memory allocation. -The size of the monitor DCSS will be printed into syslog after loading the -module. You can also use the (Class E privileged) CP command Q NSS MAP to -list all available segments and information about them. 
- -As with most char devices, error conditions are indicated by returning a -negative value for the number of bytes read. In this case, the errno variable -indicates the error condition: - -EIO: - reply failed, read data is invalid and the application - should discard the data read since the last successful read with 0 size. -EFAULT: - copy_to_user failed, read data is invalid and the application should - discard the data read since the last successful read with 0 size. -EAGAIN: - occurs on a non-blocking read if there is no data available at the - moment. There is no data missing or corrupted, just try again or rather - use polling for non-blocking reads. -EOVERFLOW: - message limit reached, the data read since the last successful - read with 0 size is valid but subsequent records may be missing. - -In the last case (EOVERFLOW) there may be missing data, in the first two cases -(EIO, EFAULT) there will be missing data. It's up to the application if it will -continue reading subsequent data or rather exit. - -Open: ------ -Only one user is allowed to open the char device. If it is already in use, the -open function will fail (return a negative value) and set errno to EBUSY. -The open function may also fail if an IUCV connection to the `*MONITOR` service -cannot be established. In this case errno will be set to EIO and an error -message with an IPUSER SEVER code will be printed into syslog. The IPUSER SEVER -codes are described in the "z/VM Performance" book, Appendix A. - -NOTE: ------ -As soon as the device is opened, incoming messages will be accepted and they -will account for the message limit, i.e. opening the device without reading -from it will provoke the "message limit reached" error (EOVERFLOW error code) -eventually. diff --git a/Documentation/s390/pci.rst b/Documentation/s390/pci.rst deleted file mode 100644 index a1a72a47dc96..000000000000 --- a/Documentation/s390/pci.rst +++ /dev/null @@ -1,133 +0,0 @@ -.. 
SPDX-License-Identifier: GPL-2.0 - -========= -S/390 PCI -========= - -Authors: - - Pierre Morel - -Copyright, IBM Corp. 2020 - - -Command line parameters and debugfs entries -=========================================== - -Command line parameters ------------------------ - -* nomio - - Do not use PCI Mapped I/O (MIO) instructions. - -* norid - - Ignore the RID field and force use of one PCI domain per PCI function. - -debugfs entries ---------------- - -The S/390 debug feature (s390dbf) generates views to hold various debug results in sysfs directories of the form: - - * /sys/kernel/debug/s390dbf/pci_*/ - -For example: - - - /sys/kernel/debug/s390dbf/pci_msg/sprintf - Holds messages from the processing of PCI events, like machine check handling - and setting of global functionality, like UID checking. - - Change the level of logging to be more or less verbose by piping - a number between 0 and 6 to /sys/kernel/debug/s390dbf/pci_*/level. For - details, see the documentation on the S/390 debug feature at - Documentation/s390/s390dbf.rst. - -Sysfs entries -============= - -Entries specific to zPCI functions and entries that hold zPCI information. - -* /sys/bus/pci/slots/XXXXXXXX - - The slot entries are set up using the function identifier (FID) of the - PCI function. The format depicted as XXXXXXXX above is 8 hexadecimal digits - with 0 padding and lower case hexadecimal digits. - - - /sys/bus/pci/slots/XXXXXXXX/power - - A physical function that currently supports a virtual function cannot be - powered off until all virtual functions are removed with: - echo 0 > /sys/bus/pci/devices/XXXX:XX:XX.X/sriov_numvf - -* /sys/bus/pci/devices/XXXX:XX:XX.X/ - - - function_id - A zPCI function identifier that uniquely identifies the function in the Z server. - - - function_handle - Low-level identifier used for a configured PCI function. - It might be useful for debugging. - - - pchid - Model-dependent location of the I/O adapter. 
- - - pfgid - PCI function group ID, functions that share identical functionality - use a common identifier. - A PCI group defines interrupts, IOMMU, IOTLB, and DMA specifics. - - - vfn - The virtual function number, from 1 to N for virtual functions, - 0 for physical functions. - - - pft - The PCI function type - - - port - The port corresponds to the physical port the function is attached to. - It also gives an indication of the physical function a virtual function - is attached to. - - - uid - The user identifier (UID) may be defined as part of the machine - configuration or the z/VM or KVM guest configuration. If the accompanying - uid_is_unique attribute is 1 the platform guarantees that the UID is unique - within that instance and no devices with the same UID can be attached - during the lifetime of the system. - - - uid_is_unique - Indicates whether the user identifier (UID) is guaranteed to be and remain - unique within this Linux instance. - - - pfip/segmentX - The segments determine the isolation of a function. - They correspond to the physical path to the function. - The more the segments are different, the more the functions are isolated. - -Enumeration and hotplug -======================= - -The PCI address consists of four parts: domain, bus, device and function, -and is of this form: DDDD:BB:dd.f - -* When not using multi-functions (norid is set, or the firmware does not - support multi-functions): - - - There is only one function per domain. - - - The domain is set from the zPCI function's UID as defined during the - LPAR creation. - -* When using multi-functions (norid parameter is not set), - zPCI functions are addressed differently: - - - There is still only one bus per domain. - - - There can be up to 256 functions per bus. - - - The domain part of the address of all functions for - a multi-Function device is set from the zPCI function's UID as defined - in the LPAR creation for the function zero. 
- - - New functions will only be ready for use after the function zero - (the function with devfn 0) has been enumerated. diff --git a/Documentation/s390/qeth.rst b/Documentation/s390/qeth.rst deleted file mode 100644 index f02fdaa68de0..000000000000 --- a/Documentation/s390/qeth.rst +++ /dev/null @@ -1,64 +0,0 @@ -============================= -IBM s390 QDIO Ethernet Driver -============================= - -OSA and HiperSockets Bridge Port Support -======================================== - -Uevents -------- - -To generate the events the device must be assigned a role of either -a primary or a secondary Bridge Port. For more information, see -"z/VM Connectivity, SC24-6174". - -When run on an OSA or HiperSockets Bridge Capable Port hardware, and the state -of some configured Bridge Port device on the channel changes, a udev -event with ACTION=CHANGE is emitted on behalf of the corresponding -ccwgroup device. The event has the following attributes: - -BRIDGEPORT=statechange - indicates that the Bridge Port device changed - its state. - -ROLE={primary|secondary|none} - the role assigned to the port. - -STATE={active|standby|inactive} - the newly assumed state of the port. - -When run on HiperSockets Bridge Capable Port hardware with host address -notifications enabled, a udev event with ACTION=CHANGE is emitted. -It is emitted on behalf of the corresponding ccwgroup device when a host -or a VLAN is registered or unregistered on the network served by the device. -The event has the following attributes: - -BRIDGEDHOST={reset|register|deregister|abort} - host address - notifications are started afresh, a new host or VLAN is registered or - deregistered on the Bridge Port HiperSockets channel, or address - notifications are aborted. - -VLAN=numeric-vlan-id - VLAN ID on which the event occurred. Not included - if no VLAN is involved in the event. - -MAC=xx:xx:xx:xx:xx:xx - MAC address of the host that is being registered - or deregistered from the HiperSockets channel. 
Not reported if the - event reports the creation or destruction of a VLAN. - -NTOK_BUSID=x.y.zzzz - device bus ID (CSSID, SSID and device number). - -NTOK_IID=xx - device IID. - -NTOK_CHPID=xx - device CHPID. - -NTOK_CHID=xxxx - device channel ID. - -Note that the `NTOK_*` attributes refer to devices other than the one -connected to the system on which the OS is running. diff --git a/Documentation/s390/s390dbf.rst b/Documentation/s390/s390dbf.rst deleted file mode 100644 index af8bdc3629e7..000000000000 --- a/Documentation/s390/s390dbf.rst +++ /dev/null @@ -1,478 +0,0 @@ -================== -S390 Debug Feature -================== - -files: - - arch/s390/kernel/debug.c - - arch/s390/include/asm/debug.h - -Description: ------------- -The goal of this feature is to provide a kernel debug logging API -where log records can be stored efficiently in memory, where each component -(e.g. device drivers) can have one separate debug log. -One purpose of this is to inspect the debug logs after a production system crash -in order to analyze the reason for the crash. - -If the system still runs but only a subcomponent which uses dbf fails, -it is possible to look at the debug logs on a live system via the Linux -debugfs filesystem. - -The debug feature may also be very useful for kernel and driver development. - -Design: -------- -Kernel components (e.g. device drivers) can register themselves at the debug -feature with the function call :c:func:`debug_register()`. -This function initializes a -debug log for the caller. For each debug log exists a number of debug areas -where exactly one is active at one time. Each debug area consists of contiguous -pages in memory. In the debug areas there are stored debug entries (log records) -which are written by event- and exception-calls. - -An event-call writes the specified debug entry to the active debug -area and updates the log pointer for the active area.
If the end -of the active debug area is reached, a wrap around is done (ring buffer) -and the next debug entry will be written at the beginning of the active -debug area. - -An exception-call writes the specified debug entry to the log and -switches to the next debug area. This is done in order to be sure -that the records which describe the origin of the exception are not -overwritten when a wrap around for the current area occurs. - -The debug areas themselves are also ordered in form of a ring buffer. -When an exception is thrown in the last debug area, the following debug -entries are then written again in the very first area. - -There are four versions for the event- and exception-calls: One for -logging raw data, one for text, one for numbers (unsigned int and long), -and one for sprintf-like formatted strings. - -Each debug entry contains the following data: - -- Timestamp -- Cpu-Number of calling task -- Level of debug entry (0...6) -- Return Address to caller -- Flag, if entry is an exception or not - -The debug logs can be inspected in a live system through entries in -the debugfs-filesystem. Under the toplevel directory "``s390dbf``" there is -a directory for each registered component, which is named like the -corresponding component. The debugfs normally should be mounted to -``/sys/kernel/debug`` therefore the debug feature can be accessed under -``/sys/kernel/debug/s390dbf``. - -The content of the directories are files which represent different views -to the debug log. Each component can decide which views should be -used through registering them with the function :c:func:`debug_register_view()`. -Predefined views for hex/ascii and sprintf data are provided. -It is also possible to define other views. The content of -a view can be inspected simply by reading the corresponding debugfs file. - -All debug logs have an actual debug level (range from 0 to 6). -The default level is 3. Event and Exception functions have a :c:data:`level` -parameter. 
Only debug entries with a level that is lower or equal -than the actual level are written to the log. This means, when -writing events, high priority log entries should have a low level -value whereas low priority entries should have a high one. -The actual debug level can be changed with the help of the debugfs-filesystem -through writing a number string "x" to the ``level`` debugfs file which is -provided for every debug log. Debugging can be switched off completely -by using "-" on the ``level`` debugfs file. - -Example:: - - > echo "-" > /sys/kernel/debug/s390dbf/dasd/level - -It is also possible to deactivate the debug feature globally for every -debug log. You can change the behavior using 2 sysctl parameters in -``/proc/sys/s390dbf``: - -There are currently 2 possible triggers, which stop the debug feature -globally. The first possibility is to use the ``debug_active`` sysctl. If -set to 1 the debug feature is running. If ``debug_active`` is set to 0 the -debug feature is turned off. - -The second trigger which stops the debug feature is a kernel oops. -That prevents the debug feature from overwriting debug information that -happened before the oops. After an oops you can reactivate the debug feature -by piping 1 to ``/proc/sys/s390dbf/debug_active``. Nevertheless, it's not -suggested to use an oopsed kernel in a production environment. - -If you want to disallow the deactivation of the debug feature, you can use -the ``debug_stoppable`` sysctl. If you set ``debug_stoppable`` to 0 the debug -feature cannot be stopped. If the debug feature is already stopped, it -will stay deactivated. - -Kernel Interfaces: ------------------- - -.. kernel-doc:: arch/s390/kernel/debug.c -.. kernel-doc:: arch/s390/include/asm/debug.h - -Predefined views: ------------------ - -.. code-block:: c - - extern struct debug_view debug_hex_ascii_view; - - extern struct debug_view debug_sprintf_view; - -Examples --------- - -.. 
code-block:: c - - /* - * hex_ascii-view Example - */ - - #include - #include - - static debug_info_t *debug_info; - - static int init(void) - { - /* register 4 debug areas with one page each and 4 byte data field */ - - debug_info = debug_register("test", 1, 4, 4 ); - debug_register_view(debug_info, &debug_hex_ascii_view); - - debug_text_event(debug_info, 4 , "one "); - debug_int_exception(debug_info, 4, 4711); - debug_event(debug_info, 3, &debug_info, 4); - - return 0; - } - - static void cleanup(void) - { - debug_unregister(debug_info); - } - - module_init(init); - module_exit(cleanup); - -.. code-block:: c - - /* - * sprintf-view Example - */ - - #include - #include - - static debug_info_t *debug_info; - - static int init(void) - { - /* register 4 debug areas with one page each and data field for */ - /* format string pointer + 2 varargs (= 3 * sizeof(long)) */ - - debug_info = debug_register("test", 1, 4, sizeof(long) * 3); - debug_register_view(debug_info, &debug_sprintf_view); - - debug_sprintf_event(debug_info, 2 , "first event in %s:%i\n",__FILE__,__LINE__); - debug_sprintf_exception(debug_info, 1, "pointer to debug info: %p\n",&debug_info); - - return 0; - } - - static void cleanup(void) - { - debug_unregister(debug_info); - } - - module_init(init); - module_exit(cleanup); - -Debugfs Interface ------------------ -Views to the debug logs can be investigated through reading the corresponding -debugfs-files: - -Example:: - - > ls /sys/kernel/debug/s390dbf/dasd - flush hex_ascii level pages - > cat /sys/kernel/debug/s390dbf/dasd/hex_ascii | sort -k2,2 -s - 00 00974733272:680099 2 - 02 0006ad7e 07 ea 4a 90 | .... - 00 00974733272:682210 2 - 02 0006ade6 46 52 45 45 | FREE - 00 00974733272:682213 2 - 02 0006adf6 07 ea 4a 90 | .... - 00 00974733272:682281 1 * 02 0006ab08 41 4c 4c 43 | EXCP - 01 00974733272:682284 2 - 02 0006ab16 45 43 4b 44 | ECKD - 01 00974733272:682287 2 - 02 0006ab28 00 00 00 04 | .... - 01 00974733272:682289 2 - 02 0006ab3e 00 00 00 20 | ... 
- 01 00974733272:682297 2 - 02 0006ad7e 07 ea 4a 90 | .... - 01 00974733272:684384 2 - 00 0006ade6 46 52 45 45 | FREE - 01 00974733272:684388 2 - 00 0006adf6 07 ea 4a 90 | .... - -See section about predefined views for explanation of the above output! - -Changing the debug level ------------------------- - -Example:: - - - > cat /sys/kernel/debug/s390dbf/dasd/level - 3 - > echo "5" > /sys/kernel/debug/s390dbf/dasd/level - > cat /sys/kernel/debug/s390dbf/dasd/level - 5 - -Flushing debug areas --------------------- -Debug areas can be flushed by piping the number of the desired -area (0...n) to the debugfs file "flush". When using "-" all debug areas -are flushed. - -Examples: - -1. Flush debug area 0:: - - > echo "0" > /sys/kernel/debug/s390dbf/dasd/flush - -2. Flush all debug areas:: - - > echo "-" > /sys/kernel/debug/s390dbf/dasd/flush - -Changing the size of debug areas ------------------------------------- -It is possible to change the size of debug areas through piping -the number of pages to the debugfs file "pages". The resize request will -also flush the debug areas. - -Example: - -Define 4 pages for the debug areas of debug feature "dasd":: - - > echo "4" > /sys/kernel/debug/s390dbf/dasd/pages - -Stopping the debug feature --------------------------- -Example: - -1. Check if stopping is allowed:: - - > cat /proc/sys/s390dbf/debug_stoppable - -2. Stop debug feature:: - - > echo 0 > /proc/sys/s390dbf/debug_active - -crash Interface ----------------- -The ``crash`` tool since v5.1.0 has a built-in command -``s390dbf`` to display all the debug logs or export them to the file system. -With this tool it is possible -to investigate the debug logs on a live system and with a memory dump after -a system crash. - -Investigating raw memory ------------------------- -One last possibility to investigate the debug logs at a live -system and after a system crash is to look at the raw memory -under VM or at the Service Element.
-It is possible to find the anchor of the debug-logs through -the ``debug_area_first`` symbol in the System map. Then one has -to follow the correct pointers of the data-structures defined -in debug.h and find the debug-areas in memory. -Normally modules which use the debug feature will also have -a global variable with the pointer to the debug-logs. Following -this pointer it will also be possible to find the debug logs in -memory. - -For this method it is recommended to use '16 * x + 4' byte (x = 0..n) -for the length of the data field in :c:func:`debug_register()` in -order to see the debug entries well formatted. - - -Predefined Views ----------------- - -There are two predefined views: hex_ascii and sprintf. -The hex_ascii view shows the data field in hex and ascii representation -(e.g. ``45 43 4b 44 | ECKD``). - -The sprintf view formats the debug entries in the same way as the sprintf -function would do. The sprintf event/exception functions write to the -debug entry a pointer to the format string (size = sizeof(long)) -and for each vararg a long value. So e.g. for a debug entry with a format -string plus two varargs one would need to allocate a (3 * sizeof(long)) -byte data area in the debug_register() function. - -IMPORTANT: - Using "%s" in sprintf event functions is dangerous. You can only - use "%s" in the sprintf event functions, if the memory for the passed string - is available as long as the debug feature exists. The reason behind this is - that due to performance considerations only a pointer to the string is stored - in the debug feature. If you log a string that is freed afterwards, you will - get an OOPS when inspecting the debug feature, because then the debug feature - will access the already freed memory. - -NOTE: - If using the sprintf view do NOT use other event/exception functions - than the sprintf-event and -exception functions. 
- -The format of the hex_ascii and sprintf view is as follows: - -- Number of area -- Timestamp (formatted as seconds and microseconds since 00:00:00 Coordinated - Universal Time (UTC), January 1, 1970) -- level of debug entry -- Exception flag (* = Exception) -- Cpu-Number of calling task -- Return Address to caller -- data field - -A typical line of the hex_ascii view will look like the following (first line -is only for explanation and will not be displayed when 'cating' the view):: - - area time level exception cpu caller data (hex + ascii) - -------------------------------------------------------------------------- - 00 00964419409:440690 1 - 00 88023fe - - -Defining views --------------- - -Views are specified with the 'debug_view' structure. There are defined -callback functions which are used for reading and writing the debugfs files: - -.. code-block:: c - - struct debug_view { - char name[DEBUG_MAX_PROCF_LEN]; - debug_prolog_proc_t* prolog_proc; - debug_header_proc_t* header_proc; - debug_format_proc_t* format_proc; - debug_input_proc_t* input_proc; - void* private_data; - }; - -where: - -.. code-block:: c - - typedef int (debug_header_proc_t) (debug_info_t* id, - struct debug_view* view, - int area, - debug_entry_t* entry, - char* out_buf); - - typedef int (debug_format_proc_t) (debug_info_t* id, - struct debug_view* view, char* out_buf, - const char* in_buf); - typedef int (debug_prolog_proc_t) (debug_info_t* id, - struct debug_view* view, - char* out_buf); - typedef int (debug_input_proc_t) (debug_info_t* id, - struct debug_view* view, - struct file* file, const char* user_buf, - size_t in_buf_size, loff_t* offset); - - -The "private_data" member can be used as pointer to view specific data. -It is not used by the debug feature itself. 
- -The output when reading a debugfs file is structured like this:: - - "prolog_proc output" - - "header_proc output 1" "format_proc output 1" - "header_proc output 2" "format_proc output 2" - "header_proc output 3" "format_proc output 3" - ... - -When a view is read from the debugfs, the Debug Feature calls the -'prolog_proc' once for writing the prolog. -Then 'header_proc' and 'format_proc' are called for each -existing debug entry. - -The input_proc can be used to implement functionality when it is written to -the view (e.g. like with ``echo "0" > /sys/kernel/debug/s390dbf/dasd/level``). - -For header_proc there can be used the default function -:c:func:`debug_dflt_header_fn()` which is defined in debug.h. -and which produces the same header output as the predefined views. -E.g:: - - 00 00964419409:440761 2 - 00 88023ec - -In order to see how to use the callback functions check the implementation -of the default views! - -Example: - -.. code-block:: c - - #include - - #define UNKNOWNSTR "data: %08x" - - const char* messages[] = - {"This error...........\n", - "That error...........\n", - "Problem..............\n", - "Something went wrong.\n", - "Everything ok........\n", - NULL - }; - - static int debug_test_format_fn( - debug_info_t *id, struct debug_view *view, - char *out_buf, const char *in_buf - ) - { - int i, rc = 0; - - if (id->buf_size >= 4) { - int msg_nr = *((int*)in_buf); - if (msg_nr < sizeof(messages) / sizeof(char*) - 1) - rc += sprintf(out_buf, "%s", messages[msg_nr]); - else - rc += sprintf(out_buf, UNKNOWNSTR, msg_nr); - } - return rc; - } - - struct debug_view debug_test_view = { - "myview", /* name of view */ - NULL, /* no prolog */ - &debug_dflt_header_fn, /* default header for each entry */ - &debug_test_format_fn, /* our own format function */ - NULL, /* no input function */ - NULL /* no private data */ - }; - -test: -===== - -.. code-block:: c - - debug_info_t *debug_info; - int i; - ... 
- debug_info = debug_register("test", 0, 4, 4); - debug_register_view(debug_info, &debug_test_view); - for (i = 0; i < 10; i ++) - debug_int_event(debug_info, 1, i); - -:: - - > cat /sys/kernel/debug/s390dbf/test/myview - 00 00964419734:611402 1 - 00 88042ca This error........... - 00 00964419734:611405 1 - 00 88042ca That error........... - 00 00964419734:611408 1 - 00 88042ca Problem.............. - 00 00964419734:611411 1 - 00 88042ca Something went wrong. - 00 00964419734:611414 1 - 00 88042ca Everything ok........ - 00 00964419734:611417 1 - 00 88042ca data: 00000005 - 00 00964419734:611419 1 - 00 88042ca data: 00000006 - 00 00964419734:611422 1 - 00 88042ca data: 00000007 - 00 00964419734:611425 1 - 00 88042ca data: 00000008 - 00 00964419734:611428 1 - 00 88042ca data: 00000009 diff --git a/Documentation/s390/text_files.rst b/Documentation/s390/text_files.rst deleted file mode 100644 index c94d05d4fa17..000000000000 --- a/Documentation/s390/text_files.rst +++ /dev/null @@ -1,11 +0,0 @@ -ibm 3270 changelog ------------------- - -.. include:: 3270.ChangeLog - :literal: - -ibm 3270 config3270.sh ----------------------- - -.. literalinclude:: config3270.sh - :language: shell diff --git a/Documentation/s390/vfio-ap-locking.rst b/Documentation/s390/vfio-ap-locking.rst deleted file mode 100644 index 0dfcdb562e21..000000000000 --- a/Documentation/s390/vfio-ap-locking.rst +++ /dev/null @@ -1,115 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -====================== -VFIO AP Locks Overview -====================== -This document describes the locks that are pertinent to the secure operation -of the vfio_ap device driver. Throughout this document, the following variables -will be used to denote instances of the structures herein described: - -.. 
code-block:: c - - struct ap_matrix_dev *matrix_dev; - struct ap_matrix_mdev *matrix_mdev; - struct kvm *kvm; - -The Matrix Devices Lock (drivers/s390/crypto/vfio_ap_private.h) ---------------------------------------------------------------- - -.. code-block:: c - - struct ap_matrix_dev { - ... - struct list_head mdev_list; - struct mutex mdevs_lock; - ... - } - -The Matrix Devices Lock (matrix_dev->mdevs_lock) is implemented as a global -mutex contained within the single object of struct ap_matrix_dev. This lock -controls access to all fields contained within each matrix_mdev -(matrix_dev->mdev_list). This lock must be held while reading from, writing to -or using the data from a field contained within a matrix_mdev instance -representing one of the vfio_ap device driver's mediated devices. - -The KVM Lock (include/linux/kvm_host.h) ---------------------------------------- - -.. code-block:: c - - struct kvm { - ... - struct mutex lock; - ... - } - -The KVM Lock (kvm->lock) controls access to the state data for a KVM guest. This -lock must be held by the vfio_ap device driver while one or more AP adapters, -domains or control domains are being plugged into or unplugged from the guest. - -The KVM pointer is stored in the matrix_mdev instance -(matrix_mdev->kvm = kvm) containing the state of the mediated device that has -been attached to the KVM guest. - -The Guests Lock (drivers/s390/crypto/vfio_ap_private.h) ------------------------------------------------------------ - -.. code-block:: c - - struct ap_matrix_dev { - ... - struct list_head mdev_list; - struct mutex guests_lock; - ... - } - -The Guests Lock (matrix_dev->guests_lock) controls access to the -matrix_mdev instances (matrix_dev->mdev_list) that represent mediated devices -that hold the state for the mediated devices that have been attached to a -KVM guest. This lock must be held: - -1.
To control access to the KVM pointer (matrix_mdev->kvm) while the vfio_ap - device driver is using it to plug/unplug AP devices passed through to the KVM - guest. - -2. To add matrix_mdev instances to or remove them from matrix_dev->mdev_list. - This is necessary to ensure the proper locking order when the list is perused - to find an ap_matrix_mdev instance for the purpose of plugging/unplugging - AP devices passed through to a KVM guest. - - For example, when a queue device is removed from the vfio_ap device driver, - if the adapter is passed through to a KVM guest, it will have to be - unplugged. In order to figure out whether the adapter is passed through, - the matrix_mdev object to which the queue is assigned will have to be - found. The KVM pointer (matrix_mdev->kvm) can then be used to determine if - the mediated device is passed through (matrix_mdev->kvm != NULL) and if so, - to unplug the adapter. - -It is not necessary to take the Guests Lock to access the KVM pointer if the -pointer is not used to plug/unplug devices passed through to the KVM guest; -however, in this case, the Matrix Devices Lock (matrix_dev->mdevs_lock) must be -held in order to access the KVM pointer since it is set and cleared under the -protection of the Matrix Devices Lock. A case in point is the function that -handles interception of the PQAP(AQIC) instruction sub-function. This handler -needs to access the KVM pointer only for the purposes of setting or clearing IRQ -resources, so only the matrix_dev->mdevs_lock needs to be held. - -The PQAP Hook Lock (arch/s390/include/asm/kvm_host.h) ------------------------------------------------------ - -.. code-block:: c - - typedef int (*crypto_hook)(struct kvm_vcpu *vcpu); - - struct kvm_s390_crypto { - ... - struct rw_semaphore pqap_hook_rwsem; - crypto_hook *pqap_hook; - ... 
- }; - -The PQAP Hook Lock is a r/w semaphore that controls access to the function -pointer of the handler ``(*kvm->arch.crypto.pqap_hook)`` to invoke when the -PQAP(AQIC) instruction sub-function is intercepted by the host. The lock must be -held in write mode when pqap_hook value is set, and in read mode when the -pqap_hook function is called. diff --git a/Documentation/s390/vfio-ap.rst b/Documentation/s390/vfio-ap.rst deleted file mode 100644 index bb3f4c4e2885..000000000000 --- a/Documentation/s390/vfio-ap.rst +++ /dev/null @@ -1,1069 +0,0 @@ -=============================== -Adjunct Processor (AP) facility -=============================== - - -Introduction -============ -The Adjunct Processor (AP) facility is an IBM Z cryptographic facility comprised -of three AP instructions and from 1 up to 256 PCIe cryptographic adapter cards. -The AP devices provide cryptographic functions to all CPUs assigned to a -linux system running in an IBM Z system LPAR. - -The AP adapter cards are exposed via the AP bus. The motivation for vfio-ap -is to make AP cards available to KVM guests using the VFIO mediated device -framework. This implementation relies considerably on the s390 virtualization -facilities which do most of the hard work of providing direct access to AP -devices. - -AP Architectural Overview -========================= -To facilitate the comprehension of the design, let's start with some -definitions: - -* AP adapter - - An AP adapter is an IBM Z adapter card that can perform cryptographic - functions. There can be from 0 to 256 adapters assigned to an LPAR. Adapters - assigned to the LPAR in which a linux host is running will be available to - the linux host. Each adapter is identified by a number from 0 to 255; however, - the maximum adapter number is determined by machine model and/or adapter type. - When installed, an AP adapter is accessed by AP instructions executed by any - CPU. 
- - The AP adapter cards are assigned to a given LPAR via the system's Activation - Profile which can be edited via the HMC. When the linux host system is IPL'd - in the LPAR, the AP bus detects the AP adapter cards assigned to the LPAR and - creates a sysfs device for each assigned adapter. For example, if AP adapters - 4 and 10 (0x0a) are assigned to the LPAR, the AP bus will create the following - sysfs device entries:: - - /sys/devices/ap/card04 - /sys/devices/ap/card0a - - Symbolic links to these devices will also be created in the AP bus devices - sub-directory:: - - /sys/bus/ap/devices/[card04] - /sys/bus/ap/devices/[card0a] - -* AP domain - - An adapter is partitioned into domains. An adapter can hold up to 256 domains - depending upon the adapter type and hardware configuration. A domain is - identified by a number from 0 to 255; however, the maximum domain number is - determined by machine model and/or adapter type. A domain can be thought of - as a set of hardware registers and memory used for processing AP commands. A - domain can be configured with a secure private key used for clear key - encryption. A domain is classified in one of two ways depending upon how it - may be accessed: - - * Usage domains are domains that are targeted by an AP instruction to - process an AP command. - - * Control domains are domains that are changed by an AP command sent to a - usage domain; for example, to set the secure private key for the control - domain. - - The AP usage and control domains are assigned to a given LPAR via the system's - Activation Profile which can be edited via the HMC. When a linux host system - is IPL'd in the LPAR, the AP bus module detects the AP usage and control - domains assigned to the LPAR. The domain number of each usage domain and - adapter number of each AP adapter are combined to create AP queue devices - (see AP Queue section below).
The domain number of each control domain will be - represented in a bitmask and stored in a sysfs file - /sys/bus/ap/ap_control_domain_mask. The bits in the mask, from most to least - significant bit, correspond to domains 0-255. - -* AP Queue - - An AP queue is the means by which an AP command is sent to a usage domain - inside a specific adapter. An AP queue is identified by a tuple - comprised of an AP adapter ID (APID) and an AP queue index (APQI). The - APQI corresponds to a given usage domain number within the adapter. This tuple - forms an AP Queue Number (APQN) uniquely identifying an AP queue. AP - instructions include a field containing the APQN to identify the AP queue to - which the AP command is to be sent for processing. - - The AP bus will create a sysfs device for each APQN that can be derived from - the cross product of the AP adapter and usage domain numbers detected when the - AP bus module is loaded. For example, if adapters 4 and 10 (0x0a) and usage - domains 6 and 71 (0x47) are assigned to the LPAR, the AP bus will create the - following sysfs entries:: - - /sys/devices/ap/card04/04.0006 - /sys/devices/ap/card04/04.0047 - /sys/devices/ap/card0a/0a.0006 - /sys/devices/ap/card0a/0a.0047 - - The following symbolic links to these devices will be created in the AP bus - devices subdirectory:: - - /sys/bus/ap/devices/[04.0006] - /sys/bus/ap/devices/[04.0047] - /sys/bus/ap/devices/[0a.0006] - /sys/bus/ap/devices/[0a.0047] - -* AP Instructions: - - There are three AP instructions: - - * NQAP: to enqueue an AP command-request message to a queue - * DQAP: to dequeue an AP command-reply message from a queue - * PQAP: to administer the queues - - AP instructions identify the domain that is targeted to process the AP - command; this must be one of the usage domains. An AP command may modify a - domain that is not one of the usage domains, but the modified domain - must be one of the control domains. 
- -AP and SIE -========== -Let's now take a look at how AP instructions executed on a guest are interpreted -by the hardware. - -A satellite control block called the Crypto Control Block (CRYCB) is attached to -our main hardware virtualization control block. The CRYCB contains an AP Control -Block (APCB) that has three fields to identify the adapters, usage domains and -control domains assigned to the KVM guest: - -* The AP Mask (APM) field is a bit mask that identifies the AP adapters assigned - to the KVM guest. Each bit in the mask, from left to right, corresponds to - an APID from 0-255. If a bit is set, the corresponding adapter is valid for - use by the KVM guest. - -* The AP Queue Mask (AQM) field is a bit mask identifying the AP usage domains - assigned to the KVM guest. Each bit in the mask, from left to right, - corresponds to an AP queue index (APQI) from 0-255. If a bit is set, the - corresponding queue is valid for use by the KVM guest. - -* The AP Domain Mask field is a bit mask that identifies the AP control domains - assigned to the KVM guest. The ADM bit mask controls which domains can be - changed by an AP command-request message sent to a usage domain from the - guest. Each bit in the mask, from left to right, corresponds to a domain from - 0-255. If a bit is set, the corresponding domain can be modified by an AP - command-request message sent to a usage domain. - -If you recall from the description of an AP Queue, AP instructions include -an APQN to identify the AP queue to which an AP command-request message is to be -sent (NQAP and PQAP instructions), or from which a command-reply message is to -be received (DQAP instruction). The validity of an APQN is defined by the matrix -calculated from the APM and AQM; it is the Cartesian product of all assigned -adapter numbers (APM) with all assigned queue indexes (AQM). 
For example, if -adapters 1 and 2 and usage domains 5 and 6 are assigned to a guest, the APQNs -(1,5), (1,6), (2,5) and (2,6) will be valid for the guest. - -The APQNs can provide secure key functionality - i.e., a private key is stored -on the adapter card for each of its domains - so each APQN must be assigned to -at most one guest or to the linux host:: - - Example 1: Valid configuration: - ------------------------------ - Guest1: adapters 1,2 domains 5,6 - Guest2: adapter 1,2 domain 7 - - This is valid because both guests have a unique set of APQNs: - Guest1 has APQNs (1,5), (1,6), (2,5), (2,6); - Guest2 has APQNs (1,7), (2,7) - - Example 2: Valid configuration: - ------------------------------ - Guest1: adapters 1,2 domains 5,6 - Guest2: adapters 3,4 domains 5,6 - - This is also valid because both guests have a unique set of APQNs: - Guest1 has APQNs (1,5), (1,6), (2,5), (2,6); - Guest2 has APQNs (3,5), (3,6), (4,5), (4,6) - - Example 3: Invalid configuration: - -------------------------------- - Guest1: adapters 1,2 domains 5,6 - Guest2: adapter 1 domains 6,7 - - This is an invalid configuration because both guests have access to - APQN (1,6). - -The Design -========== -The design introduces three new objects: - -1. AP matrix device -2. VFIO AP device driver (vfio_ap.ko) -3. VFIO AP mediated pass-through device - -The VFIO AP device driver -------------------------- -The VFIO AP (vfio_ap) device driver serves the following purposes: - -1. Provides the interfaces to secure APQNs for exclusive use of KVM guests. - -2. Sets up the VFIO mediated device interfaces to manage a vfio_ap mediated - device and creates the sysfs interfaces for assigning adapters, usage - domains, and control domains comprising the matrix for a KVM guest. - -3. 
Configures the APM, AQM and ADM in the APCB contained in the CRYCB referenced - by a KVM guest's SIE state description to grant the guest access to a matrix - of AP devices - -Reserve APQNs for exclusive use of KVM guests ---------------------------------------------- -The following block diagram illustrates the mechanism by which APQNs are -reserved:: - - +------------------+ - 7 remove | | - +--------------------> cex4queue driver | - | | | - | +------------------+ - | - | - | +------------------+ +----------------+ - | 5 register driver | | 3 create | | - | +----------------> Device core +----------> matrix device | - | | | | | | - | | +--------^---------+ +----------------+ - | | | - | | +-------------------+ - | | +-----------------------------------+ | - | | | 4 register AP driver | | 2 register device - | | | | | - +--------+---+-v---+ +--------+-------+-+ - | | | | - | ap_bus +--------------------- > vfio_ap driver | - | | 8 probe | | - +--------^---------+ +--^--^------------+ - 6 edit | | | - apmask | +-----------------------------+ | 11 mdev create - aqmask | | 1 modprobe | - +--------+-----+---+ +----------------+-+ +----------------+ - | | | |10 create| mediated | - | admin | | VFIO device core |---------> matrix | - | + | | | device | - +------+-+---------+ +--------^---------+ +--------^-------+ - | | | | - | | 9 create vfio_ap-passthrough | | - | +------------------------------+ | - +-------------------------------------------------------------+ - 12 assign adapter/domain/control domain - -The process for reserving an AP queue for use by a KVM guest is: - -1. The administrator loads the vfio_ap device driver -2. The vfio-ap driver during its initialization will register a single 'matrix' - device with the device core. This will serve as the parent device for - all vfio_ap mediated devices used to configure an AP matrix for a guest. -3. The /sys/devices/vfio_ap/matrix device is created by the device core -4. 
The vfio_ap device driver will register with the AP bus for AP queue devices - of type 10 and higher (CEX4 and newer). The driver will provide the vfio_ap - driver's probe and remove callback interfaces. Devices older than CEX4 queues - are not supported to simplify the implementation by not needlessly - complicating the design by supporting older devices that will go out of - service in the relatively near future, and for which there are few older - systems around on which to test. -5. The AP bus registers the vfio_ap device driver with the device core -6. The administrator edits the AP adapter and queue masks to reserve AP queues - for use by the vfio_ap device driver. -7. The AP bus removes the AP queues reserved for the vfio_ap driver from the - default zcrypt cex4queue driver. -8. The AP bus probes the vfio_ap device driver to bind the queues reserved for - it. -9. The administrator creates a passthrough type vfio_ap mediated device to be - used by a guest -10. The administrator assigns the adapters, usage domains and control domains - to be exclusively used by a guest. - -Set up the VFIO mediated device interfaces ------------------------------------------- -The VFIO AP device driver utilizes the common interfaces of the VFIO mediated -device core driver to: - -* Register an AP mediated bus driver to add a vfio_ap mediated device to and - remove it from a VFIO group. 
-* Create and destroy a vfio_ap mediated device -* Add a vfio_ap mediated device to and remove it from the AP mediated bus driver -* Add a vfio_ap mediated device to and remove it from an IOMMU group - -The following high-level block diagram shows the main components and interfaces -of the VFIO AP mediated device driver:: - - +-------------+ - | | - | +---------+ | mdev_register_driver() +--------------+ - | | Mdev | +<-----------------------+ | - | | bus | | | vfio_mdev.ko | - | | driver | +----------------------->+ |<-> VFIO user - | +---------+ | probe()/remove() +--------------+ APIs - | | - | MDEV CORE | - | MODULE | - | mdev.ko | - | +---------+ | mdev_register_parent() +--------------+ - | |Physical | +<-----------------------+ | - | | device | | | vfio_ap.ko |<-> matrix - | |interface| +----------------------->+ | device - | +---------+ | callback +--------------+ - +-------------+ - -During initialization of the vfio_ap module, the matrix device is registered -with an 'mdev_parent_ops' structure that provides the sysfs attribute -structures, mdev functions and callback interfaces for managing the mediated -matrix device. - -* sysfs attribute structures: - - supported_type_groups - The VFIO mediated device framework supports creation of user-defined - mediated device types. These mediated device types are specified - via the 'supported_type_groups' structure when a device is registered - with the mediated device framework. The registration process creates the - sysfs structures for each mediated device type specified in the - 'mdev_supported_types' sub-directory of the device being registered. Along - with the device type, the sysfs attributes of the mediated device type are - provided. - - The VFIO AP device driver will register one mediated device type for - passthrough devices: - - /sys/devices/vfio_ap/matrix/mdev_supported_types/vfio_ap-passthrough - - Only the read-only attributes required by the VFIO mdev framework will - be provided:: - - ... 
name - ... device_api - ... available_instances - ... device_api - - Where: - - * name: - specifies the name of the mediated device type - * device_api: - the mediated device type's API - * available_instances: - the number of vfio_ap mediated passthrough devices - that can be created - * device_api: - specifies the VFIO API - mdev_attr_groups - This attribute group identifies the user-defined sysfs attributes of the - mediated device. When a device is registered with the VFIO mediated device - framework, the sysfs attribute files identified in the 'mdev_attr_groups' - structure will be created in the vfio_ap mediated device's directory. The - sysfs attributes for a vfio_ap mediated device are: - - assign_adapter / unassign_adapter: - Write-only attributes for assigning/unassigning an AP adapter to/from the - vfio_ap mediated device. To assign/unassign an adapter, the APID of the - adapter is echoed into the respective attribute file. - assign_domain / unassign_domain: - Write-only attributes for assigning/unassigning an AP usage domain to/from - the vfio_ap mediated device. To assign/unassign a domain, the domain - number of the usage domain is echoed into the respective attribute - file. - matrix: - A read-only file for displaying the APQNs derived from the Cartesian - product of the adapter and domain numbers assigned to the vfio_ap mediated - device. - guest_matrix: - A read-only file for displaying the APQNs derived from the Cartesian - product of the adapter and domain numbers assigned to the APM and AQM - fields respectively of the KVM guest's CRYCB. This may differ from the - APQNs assigned to the vfio_ap mediated device if any APQN does not - reference a queue device bound to the vfio_ap device driver (i.e., the - queue is not in the host's AP configuration). - assign_control_domain / unassign_control_domain: - Write-only attributes for assigning/unassigning an AP control domain - to/from the vfio_ap mediated device.
To assign/unassign a control domain, - the ID of the domain to be assigned/unassigned is echoed into the - respective attribute file. - control_domains: - A read-only file for displaying the control domain numbers assigned to the - vfio_ap mediated device. - -* functions: - - create: - allocates the ap_matrix_mdev structure used by the vfio_ap driver to: - - * Store the reference to the KVM structure for the guest using the mdev - * Store the AP matrix configuration for the adapters, domains, and control - domains assigned via the corresponding sysfs attribute files - * Store the AP matrix configuration for the adapters, domains and control - domains available to a guest. A guest may not be provided access to APQNs - referencing queue devices that do not exist, or are not bound to the - vfio_ap device driver. - - remove: - deallocates the vfio_ap mediated device's ap_matrix_mdev structure. - This will be allowed only if a running guest is not using the mdev. - -* callback interfaces - - open_device: - The vfio_ap driver uses this callback to register a - VFIO_GROUP_NOTIFY_SET_KVM notifier callback function for the matrix mdev - devices. The open_device callback is invoked by userspace to connect the - VFIO iommu group for the matrix mdev device to the MDEV bus. Access to the - KVM structure used to configure the KVM guest is provided via this callback. - The KVM structure is used to configure the guest's access to the AP matrix - defined via the vfio_ap mediated device's sysfs attribute files. - - close_device: - unregisters the VFIO_GROUP_NOTIFY_SET_KVM notifier callback function for the - matrix mdev device and deconfigures the guest's AP matrix. - - ioctl: - this callback handles the VFIO_DEVICE_GET_INFO and VFIO_DEVICE_RESET ioctls - defined by the vfio framework.
- -Configure the guest's AP resources ----------------------------------- -Configuring the AP resources for a KVM guest will be performed when the -VFIO_GROUP_NOTIFY_SET_KVM notifier callback is invoked. The notifier -function is called when userspace connects to KVM. The guest's AP resources are -configured via its APCB by: - -* Setting the bits in the APM corresponding to the APIDs assigned to the - vfio_ap mediated device via its 'assign_adapter' interface. -* Setting the bits in the AQM corresponding to the domains assigned to the - vfio_ap mediated device via its 'assign_domain' interface. -* Setting the bits in the ADM corresponding to the domain IDs assigned to the - vfio_ap mediated device via its 'assign_control_domain' interface. - -The linux device model precludes passing a device through to a KVM guest that -is not bound to the device driver facilitating its pass-through. Consequently, -an APQN that does not reference a queue device bound to the vfio_ap device -driver will not be assigned to a KVM guest's matrix. The AP architecture, -however, does not provide a means to filter individual APQNs from the guest's -matrix, so the adapters, domains and control domains assigned to vfio_ap -mediated device via its sysfs 'assign_adapter', 'assign_domain' and -'assign_control_domain' interfaces will be filtered before providing the AP -configuration to a guest: - -* The APIDs of the adapters, the APQIs of the domains and the domain numbers of - the control domains assigned to the matrix mdev that are not also assigned to - the host's AP configuration will be filtered. - -* Each APQN derived from the Cartesian product of the APIDs and APQIs assigned - to the vfio_ap mdev is examined and if any one of them does not reference a - queue device bound to the vfio_ap device driver, the adapter will not be - plugged into the guest (i.e., the bit corresponding to its APID will not be - set in the APM of the guest's APCB).
- -The CPU model features for AP ------------------------------ -The AP stack relies on the presence of the AP instructions as well as three -facilities: The AP Facilities Test (APFT) facility; the AP Query -Configuration Information (QCI) facility; and the AP Queue Interruption Control -facility. These features/facilities are made available to a KVM guest via the -following CPU model features: - -1. ap: Indicates whether the AP instructions are installed on the guest. This - feature will be enabled by KVM only if the AP instructions are installed - on the host. - -2. apft: Indicates the APFT facility is available on the guest. This facility - can be made available to the guest only if it is available on the host (i.e., - facility bit 15 is set). - -3. apqci: Indicates the AP QCI facility is available on the guest. This facility - can be made available to the guest only if it is available on the host (i.e., - facility bit 12 is set). - -4. apqi: Indicates AP Queue Interruption Control facility is available on the - guest. This facility can be made available to the guest only if it is - available on the host (i.e., facility bit 65 is set). - -Note: If the user chooses to specify a CPU model different than the 'host' -model to QEMU, the CPU model features and facilities need to be turned on -explicitly; for example:: - - /usr/bin/qemu-system-s390x ... -cpu z13,ap=on,apqci=on,apft=on,apqi=on - -A guest can be precluded from using AP features/facilities by turning them off -explicitly; for example:: - - /usr/bin/qemu-system-s390x ... -cpu host,ap=off,apqci=off,apft=off,apqi=off - -Note: If the APFT facility is turned off (apft=off) for the guest, the guest -will not see any AP devices. The zcrypt device drivers on the guest that -register for type 10 and newer AP devices - i.e., the cex4card and cex4queue -device drivers - need the APFT facility to ascertain the facilities installed on -a given AP device.
If the APFT facility is not installed on the guest, then no -adapter or domain devices will get created by the AP bus running on the -guest because only type 10 and newer devices can be configured for guest use. - -Example -======= -Let's now provide an example to illustrate how KVM guests may be given -access to AP facilities. For this example, we will show how to configure -three guests such that executing the lszcrypt command on the guests would -look like this: - -Guest1 ------- -=========== ===== ============ -CARD.DOMAIN TYPE MODE -=========== ===== ============ -05 CEX5C CCA-Coproc -05.0004 CEX5C CCA-Coproc -05.00ab CEX5C CCA-Coproc -06 CEX5A Accelerator -06.0004 CEX5A Accelerator -06.00ab CEX5A Accelerator -=========== ===== ============ - -Guest2 ------- -=========== ===== ============ -CARD.DOMAIN TYPE MODE -=========== ===== ============ -05 CEX5C CCA-Coproc -05.0047 CEX5C CCA-Coproc -05.00ff CEX5C CCA-Coproc -=========== ===== ============ - -Guest3 ------- -=========== ===== ============ -CARD.DOMAIN TYPE MODE -=========== ===== ============ -06 CEX5A Accelerator -06.0047 CEX5A Accelerator -06.00ff CEX5A Accelerator -=========== ===== ============ - -These are the steps: - -1. Install the vfio_ap module on the linux host. The dependency chain for the - vfio_ap module is: - * iommu - * s390 - * zcrypt - * vfio - * vfio_mdev - * vfio_mdev_device - * KVM - - To build the vfio_ap module, the kernel build must be configured with the - following Kconfig elements selected: - * IOMMU_SUPPORT - * S390 - * ZCRYPT - * VFIO - * KVM - - If using make menuconfig select the following to build the vfio_ap module:: - - -> Device Drivers - -> IOMMU Hardware Support - select S390 AP IOMMU Support - -> VFIO Non-Privileged userspace driver framework - -> Mediated device driver framework - -> VFIO driver for Mediated devices - -> I/O subsystem - -> VFIO support for AP devices - -2. Secure the AP queues to be used by the three guests so that the host can not - access them.
To secure them, there are two sysfs files that specify - bitmasks marking a subset of the APQN range as usable only by the default AP - queue device drivers. All remaining APQNs are available for use by - any other device driver. The vfio_ap device driver is currently the only - non-default device driver. The locations of the sysfs files containing the - masks are:: - - /sys/bus/ap/apmask - /sys/bus/ap/aqmask - - The 'apmask' is a 256-bit mask that identifies a set of AP adapter IDs - (APID). Each bit in the mask, from left to right, corresponds to an APID from - 0-255. If a bit is set, the APID belongs to the subset of APQNs marked as - available only to the default AP queue device drivers. - - The 'aqmask' is a 256-bit mask that identifies a set of AP queue indexes - (APQI). Each bit in the mask, from left to right, corresponds to an APQI from - 0-255. If a bit is set, the APQI belongs to the subset of APQNs marked as - available only to the default AP queue device drivers. - - The Cartesian product of the APIDs corresponding to the bits set in the - apmask and the APQIs corresponding to the bits set in the aqmask comprise - the subset of APQNs that can be used only by the host default device drivers. - All other APQNs are available to the non-default device drivers such as the - vfio_ap driver. - - Take, for example, the following masks:: - - apmask: - 0x7d00000000000000000000000000000000000000000000000000000000000000 - - aqmask: - 0x8000000000000000000000000000000000000000000000000000000000000000 - - The masks indicate: - - * Adapters 1, 2, 3, 4, 5, and 7 are available for use by the host default - device drivers. - - * Domain 0 is available for use by the host default device drivers - - * The subset of APQNs available for use only by the default host device - drivers are: - - (1,0), (2,0), (3,0), (4,0), (5,0) and (7,0) - - * All other APQNs are available for use by the non-default device drivers.
- - The APQN of each AP queue device assigned to the linux host is checked by the - AP bus against the set of APQNs derived from the Cartesian product of APIDs - and APQIs marked as available to the default AP queue device drivers. If a - match is detected, only the default AP queue device drivers will be probed; - otherwise, the vfio_ap device driver will be probed. - - By default, the two masks are set to reserve all APQNs for use by the default - AP queue device drivers. There are two ways the default masks can be changed: - - 1. The sysfs mask files can be edited by echoing a string into the - respective sysfs mask file in one of two formats: - - * An absolute hex string starting with 0x - like "0x12345678" - sets - the mask. If the given string is shorter than the mask, it is padded - with 0s on the right; for example, specifying a mask value of 0x41 is - the same as specifying:: - - 0x4100000000000000000000000000000000000000000000000000000000000000 - - Keep in mind that the mask reads from left to right, so the mask - above identifies device numbers 1 and 7 (01000001). - - If the string is longer than the mask, the operation is terminated with - an error (EINVAL). - - * Individual bits in the mask can be switched on and off by specifying - each bit number to be switched in a comma separated list. Each bit - number string must be prepended with a ('+') or minus ('-') to indicate - the corresponding bit is to be switched on ('+') or off ('-'). Some - valid values are: - - - "+0" switches bit 0 on - - "-13" switches bit 13 off - - "+0x41" switches bit 65 on - - "-0xff" switches bit 255 off - - The following example: - - +0,-6,+0x47,-0xf0 - - Switches bits 0 and 71 (0x47) on - - Switches bits 6 and 240 (0xf0) off - - Note that the bits not specified in the list remain as they were before - the operation. - - 2. 
The masks can also be changed at boot time via parameters on the kernel - command line like this: - - ap.apmask=0xffff ap.aqmask=0x40 - - This would create the following masks:: - - apmask: - 0xffff000000000000000000000000000000000000000000000000000000000000 - - aqmask: - 0x4000000000000000000000000000000000000000000000000000000000000000 - - Resulting in these two pools:: - - default drivers pool: adapter 0-15, domain 1 - alternate drivers pool: adapter 16-255, domains 0, 2-255 - - **Note:** - Changing a mask such that one or more APQNs will be taken from a vfio_ap - mediated device (see below) will fail with an error (EBUSY). A message - is logged to the kernel ring buffer which can be viewed with the 'dmesg' - command. The output identifies each APQN flagged as 'in use' and identifies - the vfio_ap mediated device to which it is assigned; for example: - - Userspace may not re-assign queue 05.0054 already assigned to 62177883-f1bb-47f0-914d-32a22e3a8804 - Userspace may not re-assign queue 04.0054 already assigned to cef03c3c-903d-4ecc-9a83-40694cb8aee4 - -Securing the APQNs for our example ----------------------------------- - To secure the AP queues 05.0004, 05.0047, 05.00ab, 05.00ff, 06.0004, 06.0047, - 06.00ab, and 06.00ff for use by the vfio_ap device driver, the corresponding - APQNs can be removed from the default masks using either of the following - commands:: - - echo -5,-6 > /sys/bus/ap/apmask - - echo -4,-0x47,-0xab,-0xff > /sys/bus/ap/aqmask - - Or the masks can be set as follows:: - - echo 0xf9ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff \ - > apmask - - echo 0xf7fffffffffffffffeffffffffffffffffffffffffeffffffffffffffffffffe \ - > aqmask - - This will result in AP queues 05.0004, 05.0047, 05.00ab, 05.00ff, 06.0004, - 06.0047, 06.00ab, and 06.00ff getting bound to the vfio_ap device driver. The - sysfs directory for the vfio_ap device driver will now contain symbolic links - to the AP queue devices bound to it:: - - /sys/bus/ap - ... 
[drivers] - ...... [vfio_ap] - ......... [05.0004] - ......... [05.0047] - ......... [05.00ab] - ......... [05.00ff] - ......... [06.0004] - ......... [06.0047] - ......... [06.00ab] - ......... [06.00ff] - - Keep in mind that only type 10 and newer adapters (i.e., CEX4 and later) - can be bound to the vfio_ap device driver. The reason for this is to - simplify the implementation by not needlessly complicating the design by - supporting older devices that will go out of service in the relatively near - future and for which there are few older systems on which to test. - - The administrator, therefore, must take care to secure only AP queues that - can be bound to the vfio_ap device driver. The device type for a given AP - queue device can be read from the parent card's sysfs directory. For example, - to see the hardware type of the queue 05.0004: - - cat /sys/bus/ap/devices/card05/hwtype - - The hwtype must be 10 or higher (CEX4 or newer) in order to be bound to the - vfio_ap device driver. - -3. Create the mediated devices needed to configure the AP matrixes for the - three guests and to provide an interface to the vfio_ap driver for - use by the guests:: - - /sys/devices/vfio_ap/matrix/ - --- [mdev_supported_types] - ------ [vfio_ap-passthrough] (passthrough vfio_ap mediated device type) - --------- create - --------- [devices] - - To create the mediated devices for the three guests:: - - uuidgen > create - uuidgen > create - uuidgen > create - - or - - echo $uuid1 > create - echo $uuid2 > create - echo $uuid3 > create - - This will create three mediated devices in the [devices] subdirectory named - after the UUID written to the create attribute file. 
We call them $uuid1, - $uuid2 and $uuid3 and this is the sysfs directory structure after creation:: - - /sys/devices/vfio_ap/matrix/ - --- [mdev_supported_types] - ------ [vfio_ap-passthrough] - --------- [devices] - ------------ [$uuid1] - --------------- assign_adapter - --------------- assign_control_domain - --------------- assign_domain - --------------- matrix - --------------- unassign_adapter - --------------- unassign_control_domain - --------------- unassign_domain - - ------------ [$uuid2] - --------------- assign_adapter - --------------- assign_control_domain - --------------- assign_domain - --------------- matrix - --------------- unassign_adapter - ----------------unassign_control_domain - ----------------unassign_domain - - ------------ [$uuid3] - --------------- assign_adapter - --------------- assign_control_domain - --------------- assign_domain - --------------- matrix - --------------- unassign_adapter - ----------------unassign_control_domain - ----------------unassign_domain - - Note *****: The vfio_ap mdevs do not persist across reboots unless the - mdevctl tool is used to create and persist them. - -4. The administrator now needs to configure the matrixes for the mediated - devices $uuid1 (for Guest1), $uuid2 (for Guest2) and $uuid3 (for Guest3). - - This is how the matrix is configured for Guest1:: - - echo 5 > assign_adapter - echo 6 > assign_adapter - echo 4 > assign_domain - echo 0xab > assign_domain - - Control domains can similarly be assigned using the assign_control_domain - sysfs file. - - If a mistake is made configuring an adapter, domain or control domain, - you can use the unassign_xxx files to unassign the adapter, domain or - control domain. 
- - To display the matrix configuration for Guest1:: - - cat matrix - - To display the matrix that is or will be assigned to Guest1:: - - cat guest_matrix - - This is how the matrix is configured for Guest2:: - - echo 5 > assign_adapter - echo 0x47 > assign_domain - echo 0xff > assign_domain - - This is how the matrix is configured for Guest3:: - - echo 6 > assign_adapter - echo 0x47 > assign_domain - echo 0xff > assign_domain - - In order to successfully assign an adapter: - - * The adapter number specified must represent a value from 0 up to the - maximum adapter number configured for the system. If an adapter number - higher than the maximum is specified, the operation will terminate with - an error (ENODEV). - - Note: The maximum adapter number can be obtained via the sysfs - /sys/bus/ap/ap_max_adapter_id attribute file. - - * Each APQN derived from the Cartesian product of the APID of the adapter - being assigned and the APQIs of the domains previously assigned: - - - Must only be available to the vfio_ap device driver as specified in the - sysfs /sys/bus/ap/apmask and /sys/bus/ap/aqmask attribute files. If even - one APQN is reserved for use by the host device driver, the operation - will terminate with an error (EADDRNOTAVAIL). - - - Must NOT be assigned to another vfio_ap mediated device. If even one APQN - is assigned to another vfio_ap mediated device, the operation will - terminate with an error (EBUSY). - - - Must NOT be assigned while the sysfs /sys/bus/ap/apmask and - sys/bus/ap/aqmask attribute files are being edited or the operation may - terminate with an error (EBUSY). - - In order to successfully assign a domain: - - * The domain number specified must represent a value from 0 up to the - maximum domain number configured for the system. If a domain number - higher than the maximum is specified, the operation will terminate with - an error (ENODEV). 
- - Note: The maximum domain number can be obtained via the sysfs - /sys/bus/ap/ap_max_domain_id attribute file. - - * Each APQN derived from the Cartesian product of the APQI of the domain - being assigned and the APIDs of the adapters previously assigned: - - - Must only be available to the vfio_ap device driver as specified in the - sysfs /sys/bus/ap/apmask and /sys/bus/ap/aqmask attribute files. If even - one APQN is reserved for use by the host device driver, the operation - will terminate with an error (EADDRNOTAVAIL). - - - Must NOT be assigned to another vfio_ap mediated device. If even one APQN - is assigned to another vfio_ap mediated device, the operation will - terminate with an error (EBUSY). - - - Must NOT be assigned while the sysfs /sys/bus/ap/apmask and - sys/bus/ap/aqmask attribute files are being edited or the operation may - terminate with an error (EBUSY). - - In order to successfully assign a control domain: - - * The domain number specified must represent a value from 0 up to the maximum - domain number configured for the system. If a control domain number higher - than the maximum is specified, the operation will terminate with an - error (ENODEV). - -5. Start Guest1:: - - /usr/bin/qemu-system-s390x ... -cpu host,ap=on,apqci=on,apft=on,apqi=on \ - -device vfio-ap,sysfsdev=/sys/devices/vfio_ap/matrix/$uuid1 ... - -7. Start Guest2:: - - /usr/bin/qemu-system-s390x ... -cpu host,ap=on,apqci=on,apft=on,apqi=on \ - -device vfio-ap,sysfsdev=/sys/devices/vfio_ap/matrix/$uuid2 ... - -7. Start Guest3:: - - /usr/bin/qemu-system-s390x ... -cpu host,ap=on,apqci=on,apft=on,apqi=on \ - -device vfio-ap,sysfsdev=/sys/devices/vfio_ap/matrix/$uuid3 ... - -When the guest is shut down, the vfio_ap mediated devices may be removed. 
- -Using our example again, to remove the vfio_ap mediated device $uuid1:: - - /sys/devices/vfio_ap/matrix/ - --- [mdev_supported_types] - ------ [vfio_ap-passthrough] - --------- [devices] - ------------ [$uuid1] - --------------- remove - -:: - - echo 1 > remove - -This will remove all of the matrix mdev device's sysfs structures including -the mdev device itself. To recreate and reconfigure the matrix mdev device, -all of the steps starting with step 3 will have to be performed again. Note -that the remove will fail if a guest using the vfio_ap mdev is still running. - -It is not necessary to remove a vfio_ap mdev, but one may want to -remove it if no guest will use it during the remaining lifetime of the linux -host. If the vfio_ap mdev is removed, one may want to also reconfigure -the pool of adapters and queues reserved for use by the default drivers. - -Hot plug/unplug support: -======================== -An adapter, domain or control domain may be hot plugged into a running KVM -guest by assigning it to the vfio_ap mediated device being used by the guest if -the following conditions are met: - -* The adapter, domain or control domain must also be assigned to the host's - AP configuration. - -* Each APQN derived from the Cartesian product comprised of the APID of the - adapter being assigned and the APQIs of the domains assigned must reference a - queue device bound to the vfio_ap device driver. - -* To hot plug a domain, each APQN derived from the Cartesian product - comprised of the APQI of the domain being assigned and the APIDs of the - adapters assigned must reference a queue device bound to the vfio_ap device - driver. - -An adapter, domain or control domain may be hot unplugged from a running KVM -guest by unassigning it from the vfio_ap mediated device being used by the -guest. 
- -Over-provisioning of AP queues for a KVM guest: -=============================================== -Over-provisioning is defined herein as the assignment of adapters or domains to -a vfio_ap mediated device that do not reference AP devices in the host's AP -configuration. The idea here is that when the adapter or domain becomes -available, it will be automatically hot-plugged into the KVM guest using -the vfio_ap mediated device to which it is assigned as long as each new APQN -resulting from plugging it in references a queue device bound to the vfio_ap -device driver. - -Limitations -=========== -Live guest migration is not supported for guests using AP devices without -intervention by a system administrator. Before a KVM guest can be migrated, -the vfio_ap mediated device must be removed. Unfortunately, it can not be -removed manually (i.e., echo 1 > /sys/devices/vfio_ap/matrix/$UUID/remove) while -the mdev is in use by a KVM guest. If the guest is being emulated by QEMU, -its mdev can be hot unplugged from the guest in one of two ways: - -1. If the KVM guest was started with libvirt, you can hot unplug the mdev via - the following commands: - - virsh detach-device - - For example, to hot unplug mdev 62177883-f1bb-47f0-914d-32a22e3a8804 from - the guest named 'my-guest': - - virsh detach-device my-guest ~/config/my-guest-hostdev.xml - - The contents of my-guest-hostdev.xml: - -.. code-block:: xml - - - -
- - - - - virsh qemu-monitor-command --hmp "device-del " - - For example, to hot unplug the vfio_ap mediated device identified on the - qemu command line with 'id=hostdev0' from the guest named 'my-guest': - -.. code-block:: sh - - virsh qemu-monitor-command my-guest --hmp "device_del hostdev0" - -2. A vfio_ap mediated device can be hot unplugged by attaching the qemu monitor - to the guest and using the following qemu monitor command: - - (QEMU) device-del id= - - For example, to hot unplug the vfio_ap mediated device that was specified - on the qemu command line with 'id=hostdev0' when the guest was started: - - (QEMU) device-del id=hostdev0 - -After live migration of the KVM guest completes, an AP configuration can be -restored to the KVM guest by hot plugging a vfio_ap mediated device on the target -system into the guest in one of two ways: - -1. If the KVM guest was started with libvirt, you can hot plug a matrix mediated - device into the guest via the following virsh commands: - - virsh attach-device - - For example, to hot plug mdev 62177883-f1bb-47f0-914d-32a22e3a8804 into - the guest named 'my-guest': - - virsh attach-device my-guest ~/config/my-guest-hostdev.xml - - The contents of my-guest-hostdev.xml: - -.. code-block:: xml - - - -
- - - - - virsh qemu-monitor-command --hmp \ - "device_add vfio-ap,sysfsdev=,id=" - - For example, to hot plug the vfio_ap mediated device - 62177883-f1bb-47f0-914d-32a22e3a8804 into the guest named 'my-guest' with - device-id hostdev0: - - virsh qemu-monitor-command my-guest --hmp \ - "device_add vfio-ap,\ - sysfsdev=/sys/devices/vfio_ap/matrix/62177883-f1bb-47f0-914d-32a22e3a8804,\ - id=hostdev0" - -2. A vfio_ap mediated device can be hot plugged by attaching the qemu monitor - to the guest and using the following qemu monitor command: - - (qemu) device_add "vfio-ap,sysfsdev=,id=" - - For example, to plug the vfio_ap mediated device - 62177883-f1bb-47f0-914d-32a22e3a8804 into the guest with the device-id - hostdev0: - - (QEMU) device-add "vfio-ap,\ - sysfsdev=/sys/devices/vfio_ap/matrix/62177883-f1bb-47f0-914d-32a22e3a8804,\ - id=hostdev0" diff --git a/Documentation/s390/vfio-ccw.rst b/Documentation/s390/vfio-ccw.rst deleted file mode 100644 index 37026fa18179..000000000000 --- a/Documentation/s390/vfio-ccw.rst +++ /dev/null @@ -1,445 +0,0 @@ -================================== -vfio-ccw: the basic infrastructure -================================== - -Introduction ------------- - -Here we describe the vfio support for I/O subchannel devices for -Linux/s390. Motivation for vfio-ccw is to passthrough subchannels to a -virtual machine, while vfio is the means. - -Different than other hardware architectures, s390 has defined a unified -I/O access method, which is so called Channel I/O. It has its own access -patterns: - -- Channel programs run asynchronously on a separate (co)processor. -- The channel subsystem will access any memory designated by the caller - in the channel program directly, i.e. there is no iommu involved. - -Thus when we introduce vfio support for these devices, we realize it -with a mediated device (mdev) implementation. The vfio mdev will be -added to an iommu group, so as to make itself able to be managed by the -vfio framework. 
And we add read/write callbacks for special vfio I/O -regions to pass the channel programs from the mdev to its parent device -(the real I/O subchannel device) to do further address translation and -to perform I/O instructions. - -This document does not intend to explain the s390 I/O architecture in -every detail. More information/reference could be found here: - -- A good start to know Channel I/O in general: - https://en.wikipedia.org/wiki/Channel_I/O -- s390 architecture: - s390 Principles of Operation manual (IBM Form. No. SA22-7832) -- The existing QEMU code which implements a simple emulated channel - subsystem could also be a good reference. It makes it easier to follow - the flow. - qemu/hw/s390x/css.c - -For vfio mediated device framework: -- Documentation/driver-api/vfio-mediated-device.rst - -Motivation of vfio-ccw ----------------------- - -Typically, a guest virtualized via QEMU/KVM on s390 only sees -paravirtualized virtio devices via the "Virtio Over Channel I/O -(virtio-ccw)" transport. This makes virtio devices discoverable via -standard operating system algorithms for handling channel devices. - -However this is not enough. On s390 for the majority of devices, which -use the standard Channel I/O based mechanism, we also need to provide -the functionality of passing through them to a QEMU virtual machine. -This includes devices that don't have a virtio counterpart (e.g. tape -drives) or that have specific characteristics which guests want to -exploit. - -For passing a device to a guest, we want to use the same interface as -everybody else, namely vfio. We implement this vfio support for channel -devices via the vfio mediated device framework and the subchannel device -driver "vfio_ccw". - -Access patterns of CCW devices ------------------------------- - -s390 architecture has implemented a so called channel subsystem, that -provides a unified view of the devices physically attached to the -systems. 
Though the s390 hardware platform knows about a huge variety of -different peripheral attachments like disk devices (aka. DASDs), tapes, -communication controllers, etc. They can all be accessed by a well -defined access method and they are presenting I/O completion a unified -way: I/O interruptions. - -All I/O requires the use of channel command words (CCWs). A CCW is an -instruction to a specialized I/O channel processor. A channel program is -a sequence of CCWs which are executed by the I/O channel subsystem. To -issue a channel program to the channel subsystem, it is required to -build an operation request block (ORB), which can be used to point out -the format of the CCW and other control information to the system. The -operating system signals the I/O channel subsystem to begin executing -the channel program with a SSCH (start sub-channel) instruction. The -central processor is then free to proceed with non-I/O instructions -until interrupted. The I/O completion result is received by the -interrupt handler in the form of interrupt response block (IRB). - -Back to vfio-ccw, in short: - -- ORBs and channel programs are built in guest kernel (with guest - physical addresses). -- ORBs and channel programs are passed to the host kernel. -- Host kernel translates the guest physical addresses to real addresses - and starts the I/O with issuing a privileged Channel I/O instruction - (e.g SSCH). -- channel programs run asynchronously on a separate processor. -- I/O completion will be signaled to the host with I/O interruptions. - And it will be copied as IRB to user space to pass it back to the - guest. - -Physical vfio ccw device and its child mdev -------------------------------------------- - -As mentioned above, we realize vfio-ccw with a mdev implementation. - -Channel I/O does not have IOMMU hardware support, so the physical -vfio-ccw device does not have an IOMMU level translation or isolation. - -Subchannel I/O instructions are all privileged instructions. 
When -handling the I/O instruction interception, vfio-ccw has the software -policing and translation how the channel program is programmed before -it gets sent to hardware. - -Within this implementation, we have two drivers for two types of -devices: - -- The vfio_ccw driver for the physical subchannel device. - This is an I/O subchannel driver for the real subchannel device. It - realizes a group of callbacks and registers to the mdev framework as a - parent (physical) device. As a consequence, mdev provides vfio_ccw a - generic interface (sysfs) to create mdev devices. A vfio mdev could be - created by vfio_ccw then and added to the mediated bus. It is the vfio - device that added to an IOMMU group and a vfio group. - vfio_ccw also provides an I/O region to accept channel program - request from user space and store I/O interrupt result for user - space to retrieve. To notify user space an I/O completion, it offers - an interface to setup an eventfd fd for asynchronous signaling. - -- The vfio_mdev driver for the mediated vfio ccw device. - This is provided by the mdev framework. It is a vfio device driver for - the mdev that created by vfio_ccw. - It realizes a group of vfio device driver callbacks, adds itself to a - vfio group, and registers itself to the mdev framework as a mdev - driver. - It uses a vfio iommu backend that uses the existing map and unmap - ioctls, but rather than programming them into an IOMMU for a device, - it simply stores the translations for use by later requests. This - means that a device programmed in a VM with guest physical addresses - can have the vfio kernel convert that address to process virtual - address, pin the page and program the hardware with the host physical - address in one step. - For a mdev, the vfio iommu backend will not pin the pages during the - VFIO_IOMMU_MAP_DMA ioctl. Mdev framework will only maintain a database - of the iova<->vaddr mappings in this operation. 
And they export a - vfio_pin_pages and a vfio_unpin_pages interfaces from the vfio iommu - backend for the physical devices to pin and unpin pages by demand. - -Below is a high Level block diagram:: - - +-------------+ - | | - | +---------+ | mdev_register_driver() +--------------+ - | | Mdev | +<-----------------------+ | - | | bus | | | vfio_mdev.ko | - | | driver | +----------------------->+ |<-> VFIO user - | +---------+ | probe()/remove() +--------------+ APIs - | | - | MDEV CORE | - | MODULE | - | mdev.ko | - | +---------+ | mdev_register_parent() +--------------+ - | |Physical | +<-----------------------+ | - | | device | | | vfio_ccw.ko |<-> subchannel - | |interface| +----------------------->+ | device - | +---------+ | callback +--------------+ - +-------------+ - -The process of how these work together. - -1. vfio_ccw.ko drives the physical I/O subchannel, and registers the - physical device (with callbacks) to mdev framework. - When vfio_ccw probing the subchannel device, it registers device - pointer and callbacks to the mdev framework. Mdev related file nodes - under the device node in sysfs would be created for the subchannel - device, namely 'mdev_create', 'mdev_destroy' and - 'mdev_supported_types'. -2. Create a mediated vfio ccw device. - Use the 'mdev_create' sysfs file, we need to manually create one (and - only one for our case) mediated device. -3. vfio_mdev.ko drives the mediated ccw device. - vfio_mdev is also the vfio device driver. It will probe the mdev and - add it to an iommu_group and a vfio_group. Then we could pass through - the mdev to a guest. - - -VFIO-CCW Regions ----------------- - -The vfio-ccw driver exposes MMIO regions to accept requests from and return -results to userspace. - -vfio-ccw I/O region -------------------- - -An I/O region is used to accept channel program request from user -space and store I/O interrupt result for user space to retrieve. 
The -definition of the region is:: - - struct ccw_io_region { - #define ORB_AREA_SIZE 12 - __u8 orb_area[ORB_AREA_SIZE]; - #define SCSW_AREA_SIZE 12 - __u8 scsw_area[SCSW_AREA_SIZE]; - #define IRB_AREA_SIZE 96 - __u8 irb_area[IRB_AREA_SIZE]; - __u32 ret_code; - } __packed; - -This region is always available. - -While starting an I/O request, orb_area should be filled with the -guest ORB, and scsw_area should be filled with the SCSW of the Virtual -Subchannel. - -irb_area stores the I/O result. - -ret_code stores a return code for each access of the region. The following -values may occur: - -``0`` - The operation was successful. - -``-EOPNOTSUPP`` - The ORB specified transport mode or the - SCSW specified a function other than the start function. - -``-EIO`` - A request was issued while the device was not in a state ready to accept - requests, or an internal error occurred. - -``-EBUSY`` - The subchannel was status pending or busy, or a request is already active. - -``-EAGAIN`` - A request was being processed, and the caller should retry. - -``-EACCES`` - The channel path(s) used for the I/O were found to be not operational. - -``-ENODEV`` - The device was found to be not operational. - -``-EINVAL`` - The orb specified a chain longer than 255 ccws, or an internal error - occurred. - - -vfio-ccw cmd region -------------------- - -The vfio-ccw cmd region is used to accept asynchronous instructions -from userspace:: - - #define VFIO_CCW_ASYNC_CMD_HSCH (1 << 0) - #define VFIO_CCW_ASYNC_CMD_CSCH (1 << 1) - struct ccw_cmd_region { - __u32 command; - __u32 ret_code; - } __packed; - -This region is exposed via region type VFIO_REGION_SUBTYPE_CCW_ASYNC_CMD. - -Currently, CLEAR SUBCHANNEL and HALT SUBCHANNEL use this region. - -command specifies the command to be issued; ret_code stores a return code -for each access of the region. The following values may occur: - -``0`` - The operation was successful. - -``-ENODEV`` - The device was found to be not operational. 
- -``-EINVAL`` - A command other than halt or clear was specified. - -``-EIO`` - A request was issued while the device was not in a state ready to accept - requests. - -``-EAGAIN`` - A request was being processed, and the caller should retry. - -``-EBUSY`` - The subchannel was status pending or busy while processing a halt request. - -vfio-ccw schib region ---------------------- - -The vfio-ccw schib region is used to return Subchannel-Information -Block (SCHIB) data to userspace:: - - struct ccw_schib_region { - #define SCHIB_AREA_SIZE 52 - __u8 schib_area[SCHIB_AREA_SIZE]; - } __packed; - -This region is exposed via region type VFIO_REGION_SUBTYPE_CCW_SCHIB. - -Reading this region triggers a STORE SUBCHANNEL to be issued to the -associated hardware. - -vfio-ccw crw region ---------------------- - -The vfio-ccw crw region is used to return Channel Report Word (CRW) -data to userspace:: - - struct ccw_crw_region { - __u32 crw; - __u32 pad; - } __packed; - -This region is exposed via region type VFIO_REGION_SUBTYPE_CCW_CRW. - -Reading this region returns a CRW if one that is relevant for this -subchannel (e.g. one reporting changes in channel path state) is -pending, or all zeroes if not. If multiple CRWs are pending (including -possibly chained CRWs), reading this region again will return the next -one, until no more CRWs are pending and zeroes are returned. This is -similar to how STORE CHANNEL REPORT WORD works. - -vfio-ccw operation details --------------------------- - -vfio-ccw follows what vfio-pci did on the s390 platform and uses -vfio-iommu-type1 as the vfio iommu backend. - -* CCW translation APIs - A group of APIs (start with `cp_`) to do CCW translation. The CCWs - passed in by a user space program are organized with their guest - physical memory addresses. These APIs will copy the CCWs into kernel - space, and assemble a runnable kernel channel program by updating the - guest physical addresses with their corresponding host physical addresses. 
- Note that we have to use IDALs even for direct-access CCWs, as the - referenced memory can be located anywhere, including above 2G. - -* vfio_ccw device driver - This driver utilizes the CCW translation APIs and introduces - vfio_ccw, which is the driver for the I/O subchannel devices you want - to pass through. - vfio_ccw implements the following vfio ioctls:: - - VFIO_DEVICE_GET_INFO - VFIO_DEVICE_GET_IRQ_INFO - VFIO_DEVICE_GET_REGION_INFO - VFIO_DEVICE_RESET - VFIO_DEVICE_SET_IRQS - - This provides an I/O region, so that the user space program can pass a - channel program to the kernel, to do further CCW translation before - issuing them to a real device. - This also provides the SET_IRQ ioctl to setup an event notifier to - notify the user space program the I/O completion in an asynchronous - way. - -The use of vfio-ccw is not limited to QEMU, while QEMU is definitely a -good example to get understand how these patches work. Here is a little -bit more detail how an I/O request triggered by the QEMU guest will be -handled (without error handling). - -Explanation: - -- Q1-Q7: QEMU side process. -- K1-K5: Kernel side process. - -Q1. - Get I/O region info during initialization. - -Q2. - Setup event notifier and handler to handle I/O completion. - -... ... - -Q3. - Intercept a ssch instruction. -Q4. - Write the guest channel program and ORB to the I/O region. - - K1. - Copy from guest to kernel. - K2. - Translate the guest channel program to a host kernel space - channel program, which becomes runnable for a real device. - K3. - With the necessary information contained in the orb passed in - by QEMU, issue the ccwchain to the device. - K4. - Return the ssch CC code. -Q5. - Return the CC code to the guest. - -... ... - - K5. - Interrupt handler gets the I/O result and write the result to - the I/O region. - K6. - Signal QEMU to retrieve the result. - -Q6. - Get the signal and event handler reads out the result from the I/O - region. -Q7. 
- Update the irb for the guest. - -Limitations ------------ - -The current vfio-ccw implementation focuses on supporting basic commands -needed to implement block device functionality (read/write) of DASD/ECKD -device only. Some commands may need special handling in the future, for -example, anything related to path grouping. - -DASD is a kind of storage device. While ECKD is a data recording format. -More information for DASD and ECKD could be found here: -https://en.wikipedia.org/wiki/Direct-access_storage_device -https://en.wikipedia.org/wiki/Count_key_data - -Together with the corresponding work in QEMU, we can bring the passed -through DASD/ECKD device online in a guest now and use it as a block -device. - -The current code allows the guest to start channel programs via -START SUBCHANNEL, and to issue HALT SUBCHANNEL, CLEAR SUBCHANNEL, -and STORE SUBCHANNEL. - -Currently all channel programs are prefetched, regardless of the -p-bit setting in the ORB. As a result, self modifying channel -programs are not supported. For this reason, IPL has to be handled as -a special case by a userspace/guest program; this has been implemented -in QEMU's s390-ccw bios as of QEMU 4.1. - -vfio-ccw supports classic (command mode) channel I/O only. Transport -mode (HPF) is not supported. - -QDIO subchannels are currently not supported. Classic devices other than -DASD/ECKD might work, but have not been tested. - -Reference ---------- -1. ESA/s390 Principles of Operation manual (IBM Form. No. SA22-7832) -2. ESA/390 Common I/O Device Commands manual (IBM Form. No. SA22-7204) -3. https://en.wikipedia.org/wiki/Channel_I/O -4. Documentation/s390/cds.rst -5. Documentation/driver-api/vfio.rst -6. 
Documentation/driver-api/vfio-mediated-device.rst diff --git a/Documentation/s390/zfcpdump.rst b/Documentation/s390/zfcpdump.rst deleted file mode 100644 index a61de7aa8778..000000000000 --- a/Documentation/s390/zfcpdump.rst +++ /dev/null @@ -1,50 +0,0 @@ -================================== -The s390 SCSI dump tool (zfcpdump) -================================== - -System z machines (z900 or higher) provide hardware support for creating system -dumps on SCSI disks. The dump process is initiated by booting a dump tool, which -has to create a dump of the current (probably crashed) Linux image. In order to -not overwrite memory of the crashed Linux with data of the dump tool, the -hardware saves some memory plus the register sets of the boot CPU before the -dump tool is loaded. There exists an SCLP hardware interface to obtain the saved -memory afterwards. Currently 32 MB are saved. - -This zfcpdump implementation consists of a Linux dump kernel together with -a user space dump tool, which are loaded together into the saved memory region -below 32 MB. zfcpdump is installed on a SCSI disk using zipl (as contained in -the s390-tools package) to make the device bootable. The operator of a Linux -system can then trigger a SCSI dump by booting the SCSI disk, where zfcpdump -resides on. - -The user space dump tool accesses the memory of the crashed system by means -of the /proc/vmcore interface. This interface exports the crashed system's -memory and registers in ELF core dump format. To access the memory which has -been saved by the hardware SCLP requests will be created at the time the data -is needed by /proc/vmcore. The tail part of the crashed systems memory which -has not been stashed by hardware can just be copied from real memory. - -To build a dump enabled kernel the kernel config option CONFIG_CRASH_DUMP -has to be set. - -To get a valid zfcpdump kernel configuration use "make zfcpdump_defconfig". 
- -The s390 zipl tool looks for the zfcpdump kernel and optional initrd/initramfs -under the following locations: - -* kernel: /zfcpdump.image -* ramdisk: /zfcpdump.rd - -The zfcpdump directory is defined in the s390-tools package. - -The user space application of zfcpdump can reside in an intitramfs or an -initrd. It can also be included in a built-in kernel initramfs. The application -reads from /proc/vmcore or zcore/mem and writes the system dump to a SCSI disk. - -The s390-tools package version 1.24.0 and above builds an external zfcpdump -initramfs with a user space application that writes the dump to a SCSI -partition. - -For more information on how to use zfcpdump refer to the s390 'Using the Dump -Tools' book, which is available from IBM Knowledge Center: -https://www.ibm.com/support/knowledgecenter/linuxonibm/liaaf/lnz_r_dt.html diff --git a/MAINTAINERS b/MAINTAINERS index d516295978a4..262736152862 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18596,7 +18596,7 @@ L: linux-s390@vger.kernel.org S: Supported T: git git://git.kernel.org/pub/scm/linux/kernel/git/s390/linux.git F: Documentation/driver-api/s390-drivers.rst -F: Documentation/s390/ +F: Documentation/arch/s390/ F: arch/s390/ F: drivers/s390/ F: drivers/watchdog/diag288_wdt.c @@ -18657,7 +18657,7 @@ M: Niklas Schnelle M: Gerald Schaefer L: linux-s390@vger.kernel.org S: Supported -F: Documentation/s390/pci.rst +F: Documentation/arch/s390/pci.rst F: arch/s390/pci/ F: drivers/pci/hotplug/s390_pci_hpc.c @@ -18674,7 +18674,7 @@ M: Halil Pasic M: Jason Herne L: linux-s390@vger.kernel.org S: Supported -F: Documentation/s390/vfio-ap* +F: Documentation/arch/s390/vfio-ap* F: drivers/s390/crypto/vfio_ap* S390 VFIO-CCW DRIVER @@ -18684,7 +18684,7 @@ R: Halil Pasic L: linux-s390@vger.kernel.org L: kvm@vger.kernel.org S: Supported -F: Documentation/s390/vfio-ccw.rst +F: Documentation/arch/s390/vfio-ccw.rst F: drivers/s390/cio/vfio_ccw* F: include/uapi/linux/vfio_ccw.h diff --git a/arch/s390/Kconfig 
b/arch/s390/Kconfig index c5be7199067b..d9d50a7a2016 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -754,9 +754,9 @@ config CRASH_DUMP Crash dump kernels are loaded in the main kernel with kexec-tools into a specially reserved region and then later executed after a crash by kdump/kexec. - Refer to <file:Documentation/s390/zfcpdump.rst> for more details on this. + Refer to <file:Documentation/arch/s390/zfcpdump.rst> for more details on this. This option also enables s390 zfcpdump. - See also <file:Documentation/s390/zfcpdump.rst> + See also <file:Documentation/arch/s390/zfcpdump.rst> endmenu diff --git a/arch/s390/include/asm/debug.h b/arch/s390/include/asm/debug.h index ac665b9670c5..ccd4e148b5ed 100644 --- a/arch/s390/include/asm/debug.h +++ b/arch/s390/include/asm/debug.h @@ -222,7 +222,7 @@ static inline debug_entry_t *debug_text_event(debug_info_t *id, int level, /* * IMPORTANT: Use "%s" in sprintf format strings with care! Only pointers are - * stored in the s390dbf. See Documentation/s390/s390dbf.rst for more details! + * stored in the s390dbf. See Documentation/arch/s390/s390dbf.rst for more details! */ extern debug_entry_t * __debug_sprintf_event(debug_info_t *id, int level, char *string, ...) @@ -350,7 +350,7 @@ static inline debug_entry_t *debug_text_exception(debug_info_t *id, int level, /* * IMPORTANT: Use "%s" in sprintf format strings with care! Only pointers are - * stored in the s390dbf. See Documentation/s390/s390dbf.rst for more details! + * stored in the s390dbf. See Documentation/arch/s390/s390dbf.rst for more details! */ extern debug_entry_t * __debug_sprintf_exception(debug_info_t *id, int level, char *string, ...) diff --git a/drivers/s390/char/zcore.c b/drivers/s390/char/zcore.c index 942c73a11ca3..bc3be0330f1d 100644 --- a/drivers/s390/char/zcore.c +++ b/drivers/s390/char/zcore.c @@ -3,7 +3,7 @@ * zcore module to export memory content and register sets for creating system * dumps on SCSI/NVMe disks (zfcp/nvme dump). * - * For more information please refer to Documentation/s390/zfcpdump.rst + * For more information please refer to Documentation/arch/s390/zfcpdump.rst * * Copyright IBM Corp. 
2003, 2008 * Author(s): Michael Holzheu -- cgit v1.2.3 From 305b9f4f7bebc12610035f8cd865a0db87df81b6 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sat, 22 Jul 2023 02:13:58 +0900 Subject: s390: use obj-y to descend into drivers/s390/ The single build rule does not work with the drivers-y syntax. [1] Use the standard obj-y syntax. It moves the objects from drivers/s390/ to slightly lower address, but fixes the reported issue. [1]: https://lore.kernel.org/linux-kbuild/d57ba55f-20a3-b836-783d-b49c8a161b6e@kernel.org/T/#m27f781ab60acadfed8a9e9642f30d5414a5e2df3 Signed-off-by: Masahiro Yamada Tested-by: Jiri Slaby Link: https://lore.kernel.org/r/20230721171358.3612099-1-masahiroy@kernel.org Signed-off-by: Heiko Carstens --- arch/s390/Makefile | 1 - drivers/Makefile | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/s390/Makefile b/arch/s390/Makefile index 5ed242897b0d..a53a36ee0731 100644 --- a/arch/s390/Makefile +++ b/arch/s390/Makefile @@ -119,7 +119,6 @@ export KBUILD_CFLAGS_DECOMPRESSOR OBJCOPYFLAGS := -O binary libs-y += arch/s390/lib/ -drivers-y += drivers/s390/ boot := arch/s390/boot syscalls := arch/s390/kernel/syscalls diff --git a/drivers/Makefile b/drivers/Makefile index 7241d80a7b29..a7459e77df37 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -195,3 +195,5 @@ obj-$(CONFIG_PECI) += peci/ obj-$(CONFIG_HTE) += hte/ obj-$(CONFIG_DRM_ACCEL) += accel/ obj-$(CONFIG_CDX_BUS) += cdx/ + +obj-$(CONFIG_S390) += s390/ -- cgit v1.2.3 From e810487385de409e826f249fcce13105b5513a65 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 24 Jul 2023 18:24:27 +0200 Subject: s390/diag: fix diagnose 8c description The comment above diag8c() describes diagnose 210, not diagnose 8c. Add a proper short description. 
Reviewed-by: Sven Schnelle Signed-off-by: Heiko Carstens --- arch/s390/kernel/diag.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/kernel/diag.c b/arch/s390/kernel/diag.c index f287713baf6d..f9f06cd8fcee 100644 --- a/arch/s390/kernel/diag.c +++ b/arch/s390/kernel/diag.c @@ -223,7 +223,7 @@ int diag210(struct diag210 *addr) EXPORT_SYMBOL(diag210); /* - * Diagnose 210: Get information about a virtual device + * Diagnose 8C: Access 3270 Display Device Information */ int diag8c(struct diag8c *addr, struct ccw_dev_id *devno) { -- cgit v1.2.3 From 7fb0ad1938ef129aa39d565bf2ad3728a39136ef Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 10 Jun 2023 16:25:28 +0200 Subject: s390/ebcdic: fix typo in comment s/ECBDIC/EBCDIC/ (C and B are swapped) Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/08ed63331699177b3354458da66a2f63c0217e49.1686407113.git.christophe.jaillet@wanadoo.fr Signed-off-by: Heiko Carstens --- arch/s390/kernel/ebcdic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/kernel/ebcdic.c b/arch/s390/kernel/ebcdic.c index 7f8246c9be08..0e51fa537262 100644 --- a/arch/s390/kernel/ebcdic.c +++ b/arch/s390/kernel/ebcdic.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * ECBDIC -> ASCII, ASCII -> ECBDIC, + * EBCDIC -> ASCII, ASCII -> EBCDIC, * upper to lower case (EBCDIC) conversion tables. * * S390 version -- cgit v1.2.3 From 7b27d9ef0f63da736d3a585a5d5098cea62a3ad7 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 26 Jul 2023 08:18:34 +0200 Subject: s390/ftrace: use la instead of aghik in return_to_handler() Nathan Chancellor reported the following build error when compiling the kernel with CONFIG_MARCH_Z10=y: arch/s390/kernel/mcount.S: Assembler messages: arch/s390/kernel/mcount.S:140: Error: Unrecognized opcode: `aghik' The aghik instruction is only available since z196. Use the la instruction instead which is available for all machines. 
Reported-by: Nathan Chancellor Closes: https://lore.kernel.org/all/20230725211105.GA224840@dev-arch.thelio-3990X Fixes: 1256e70a082a ("s390/ftrace: enable HAVE_FUNCTION_GRAPH_RETVAL") Reviewed-by: Sven Schnelle Tested-by: Nathan Chancellor # build Link: https://lore.kernel.org/r/20230726061834.1300984-1-hca@linux.ibm.com Signed-off-by: Heiko Carstens --- arch/s390/kernel/mcount.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/kernel/mcount.S b/arch/s390/kernel/mcount.S index d2596e0df6fa..71c5fa05e7f1 100644 --- a/arch/s390/kernel/mcount.S +++ b/arch/s390/kernel/mcount.S @@ -137,7 +137,7 @@ SYM_FUNC_START(return_to_handler) lgr %r1,%r15 aghi %r15,-(STACK_FRAME_OVERHEAD+__FGRAPH_RET_SIZE) stg %r1,__SF_BACKCHAIN(%r15) - aghik %r3,%r15,STACK_FRAME_OVERHEAD + la %r3,STACK_FRAME_OVERHEAD(%r15) stg %r1,__FGRAPH_RET_FP(%r3) stg %r2,__FGRAPH_RET_GPR2(%r3) lgr %r2,%r3 -- cgit v1.2.3 From 8b46451c8bd63da543598ef8e0d67cb52281c6ef Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 26 Jul 2023 15:39:39 +0200 Subject: s390/defconfigs: set CONFIG_FUNCTION_GRAPH_RETVAL=y Enable recording and printing function return values for the function graph tracer. 
Signed-off-by: Heiko Carstens --- arch/s390/configs/debug_defconfig | 1 + arch/s390/configs/defconfig | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index aa95cf6dfabb..b042ccb8a8a6 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -836,6 +836,7 @@ CONFIG_RCU_CPU_STALL_TIMEOUT=300 # CONFIG_RCU_TRACE is not set CONFIG_LATENCYTOP=y CONFIG_BOOTTIME_TRACING=y +CONFIG_FUNCTION_GRAPH_RETVAL=y CONFIG_FPROBE=y CONFIG_FUNCTION_PROFILER=y CONFIG_STACK_TRACER=y diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index f041945f9148..0fa45c0d6bee 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -787,6 +787,7 @@ CONFIG_RCU_REF_SCALE_TEST=m CONFIG_RCU_CPU_STALL_TIMEOUT=60 CONFIG_LATENCYTOP=y CONFIG_BOOTTIME_TRACING=y +CONFIG_FUNCTION_GRAPH_RETVAL=y CONFIG_FPROBE=y CONFIG_FUNCTION_PROFILER=y CONFIG_STACK_TRACER=y -- cgit v1.2.3 From c28c07fe235ccaafe11003393de064b2a24dd2e3 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 24 Jul 2023 17:20:26 +0200 Subject: s390/mm: move pfault code to own C file The pfault code has nothing to do with regular fault handling. Therefore move it to an own C file. Also add an own pfault header file. This way changes to setup.h don't cause a recompile of the pfault code and vice versa. 
Reviewed-by: Sven Schnelle Signed-off-by: Heiko Carstens --- arch/s390/include/asm/pfault.h | 24 ++++ arch/s390/include/asm/setup.h | 8 -- arch/s390/kernel/machine_kexec.c | 1 + arch/s390/kernel/smp.c | 1 + arch/s390/mm/Makefile | 1 + arch/s390/mm/fault.c | 228 ------------------------------------- arch/s390/mm/pfault.c | 239 +++++++++++++++++++++++++++++++++++++++ 7 files changed, 266 insertions(+), 236 deletions(-) create mode 100644 arch/s390/include/asm/pfault.h create mode 100644 arch/s390/mm/pfault.c diff --git a/arch/s390/include/asm/pfault.h b/arch/s390/include/asm/pfault.h new file mode 100644 index 000000000000..beabeebf2859 --- /dev/null +++ b/arch/s390/include/asm/pfault.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright IBM Corp. 1999, 2023 + */ +#ifndef _ASM_S390_PFAULT_H +#define _ASM_S390_PFAULT_H + +int __pfault_init(void); +void __pfault_fini(void); + +static inline int pfault_init(void) +{ + if (IS_ENABLED(CONFIG_PFAULT)) + return __pfault_init(); + return -1; +} + +static inline void pfault_fini(void) +{ + if (IS_ENABLED(CONFIG_PFAULT)) + __pfault_fini(); +} + +#endif /* _ASM_S390_PFAULT_H */ diff --git a/arch/s390/include/asm/setup.h b/arch/s390/include/asm/setup.h index e795f425627a..b30fe91166e3 100644 --- a/arch/s390/include/asm/setup.h +++ b/arch/s390/include/asm/setup.h @@ -118,14 +118,6 @@ extern unsigned int console_irq; #define SET_CONSOLE_VT220 do { console_mode = 4; } while (0) #define SET_CONSOLE_HVC do { console_mode = 5; } while (0) -#ifdef CONFIG_PFAULT -extern int pfault_init(void); -extern void pfault_fini(void); -#else /* CONFIG_PFAULT */ -#define pfault_init() ({-1;}) -#define pfault_fini() do { } while (0) -#endif /* CONFIG_PFAULT */ - #ifdef CONFIG_VMCP void vmcp_cma_reserve(void); #else diff --git a/arch/s390/kernel/machine_kexec.c b/arch/s390/kernel/machine_kexec.c index 6d9276c096a6..12a2bd4fc88c 100644 --- a/arch/s390/kernel/machine_kexec.c +++ b/arch/s390/kernel/machine_kexec.c @@ -13,6 +13,7 
@@ #include #include #include +#include #include #include #include diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index f9a2b755f510..9244130721d6 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/s390/mm/Makefile b/arch/s390/mm/Makefile index d90db06a8af5..352ff520fd94 100644 --- a/arch/s390/mm/Makefile +++ b/arch/s390/mm/Makefile @@ -10,3 +10,4 @@ obj-$(CONFIG_CMM) += cmm.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_PTDUMP_CORE) += dump_pagetables.o obj-$(CONFIG_PGSTE) += gmap.o +obj-$(CONFIG_PFAULT) += pfault.o diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 2f123429a291..b5e1bea9194c 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -43,8 +43,6 @@ #include "../kernel/entry.h" #define __FAIL_ADDR_MASK -4096L -#define __SUBCODE_MASK 0x0600 -#define __PF_RES_FIELD 0x8000000000000000ULL /* * Allocate private vm_fault_reason from top. Please make sure it won't @@ -583,232 +581,6 @@ void do_dat_exception(struct pt_regs *regs) } NOKPROBE_SYMBOL(do_dat_exception); -#ifdef CONFIG_PFAULT -/* - * 'pfault' pseudo page faults routines. 
- */ -static int pfault_disable; - -static int __init nopfault(char *str) -{ - pfault_disable = 1; - return 1; -} - -__setup("nopfault", nopfault); - -struct pfault_refbk { - u16 refdiagc; - u16 reffcode; - u16 refdwlen; - u16 refversn; - u64 refgaddr; - u64 refselmk; - u64 refcmpmk; - u64 reserved; -} __attribute__ ((packed, aligned(8))); - -static struct pfault_refbk pfault_init_refbk = { - .refdiagc = 0x258, - .reffcode = 0, - .refdwlen = 5, - .refversn = 2, - .refgaddr = __LC_LPP, - .refselmk = 1ULL << 48, - .refcmpmk = 1ULL << 48, - .reserved = __PF_RES_FIELD -}; - -int pfault_init(void) -{ - int rc; - - if (pfault_disable) - return -1; - diag_stat_inc(DIAG_STAT_X258); - asm volatile( - " diag %1,%0,0x258\n" - "0: j 2f\n" - "1: la %0,8\n" - "2:\n" - EX_TABLE(0b,1b) - : "=d" (rc) - : "a" (&pfault_init_refbk), "m" (pfault_init_refbk) : "cc"); - return rc; -} - -static struct pfault_refbk pfault_fini_refbk = { - .refdiagc = 0x258, - .reffcode = 1, - .refdwlen = 5, - .refversn = 2, -}; - -void pfault_fini(void) -{ - - if (pfault_disable) - return; - diag_stat_inc(DIAG_STAT_X258); - asm volatile( - " diag %0,0,0x258\n" - "0: nopr %%r7\n" - EX_TABLE(0b,0b) - : : "a" (&pfault_fini_refbk), "m" (pfault_fini_refbk) : "cc"); -} - -static DEFINE_SPINLOCK(pfault_lock); -static LIST_HEAD(pfault_list); - -#define PF_COMPLETE 0x0080 - -/* - * The mechanism of our pfault code: if Linux is running as guest, runs a user - * space process and the user space process accesses a page that the host has - * paged out we get a pfault interrupt. - * - * This allows us, within the guest, to schedule a different process. Without - * this mechanism the host would have to suspend the whole virtual cpu until - * the page has been paged in. - * - * So when we get such an interrupt then we set the state of the current task - * to uninterruptible and also set the need_resched flag. Both happens within - * interrupt context(!). 
If we later on want to return to user space we - * recognize the need_resched flag and then call schedule(). It's not very - * obvious how this works... - * - * Of course we have a lot of additional fun with the completion interrupt (-> - * host signals that a page of a process has been paged in and the process can - * continue to run). This interrupt can arrive on any cpu and, since we have - * virtual cpus, actually appear before the interrupt that signals that a page - * is missing. - */ -static void pfault_interrupt(struct ext_code ext_code, - unsigned int param32, unsigned long param64) -{ - struct task_struct *tsk; - __u16 subcode; - pid_t pid; - - /* - * Get the external interruption subcode & pfault initial/completion - * signal bit. VM stores this in the 'cpu address' field associated - * with the external interrupt. - */ - subcode = ext_code.subcode; - if ((subcode & 0xff00) != __SUBCODE_MASK) - return; - inc_irq_stat(IRQEXT_PFL); - /* Get the token (= pid of the affected task). */ - pid = param64 & LPP_PID_MASK; - rcu_read_lock(); - tsk = find_task_by_pid_ns(pid, &init_pid_ns); - if (tsk) - get_task_struct(tsk); - rcu_read_unlock(); - if (!tsk) - return; - spin_lock(&pfault_lock); - if (subcode & PF_COMPLETE) { - /* signal bit is set -> a page has been swapped in by VM */ - if (tsk->thread.pfault_wait == 1) { - /* Initial interrupt was faster than the completion - * interrupt. pfault_wait is valid. Set pfault_wait - * back to zero and wake up the process. This can - * safely be done because the task is still sleeping - * and can't produce new pfaults. */ - tsk->thread.pfault_wait = 0; - list_del(&tsk->thread.list); - wake_up_process(tsk); - put_task_struct(tsk); - } else { - /* Completion interrupt was faster than initial - * interrupt. Set pfault_wait to -1 so the initial - * interrupt doesn't put the task to sleep. 
- * If the task is not running, ignore the completion - * interrupt since it must be a leftover of a PFAULT - * CANCEL operation which didn't remove all pending - * completion interrupts. */ - if (task_is_running(tsk)) - tsk->thread.pfault_wait = -1; - } - } else { - /* signal bit not set -> a real page is missing. */ - if (WARN_ON_ONCE(tsk != current)) - goto out; - if (tsk->thread.pfault_wait == 1) { - /* Already on the list with a reference: put to sleep */ - goto block; - } else if (tsk->thread.pfault_wait == -1) { - /* Completion interrupt was faster than the initial - * interrupt (pfault_wait == -1). Set pfault_wait - * back to zero and exit. */ - tsk->thread.pfault_wait = 0; - } else { - /* Initial interrupt arrived before completion - * interrupt. Let the task sleep. - * An extra task reference is needed since a different - * cpu may set the task state to TASK_RUNNING again - * before the scheduler is reached. */ - get_task_struct(tsk); - tsk->thread.pfault_wait = 1; - list_add(&tsk->thread.list, &pfault_list); -block: - /* Since this must be a userspace fault, there - * is no kernel task state to trample. Rely on the - * return to userspace schedule() to block. */ - __set_current_state(TASK_UNINTERRUPTIBLE); - set_tsk_need_resched(tsk); - set_preempt_need_resched(); - } - } -out: - spin_unlock(&pfault_lock); - put_task_struct(tsk); -} - -static int pfault_cpu_dead(unsigned int cpu) -{ - struct thread_struct *thread, *next; - struct task_struct *tsk; - - spin_lock_irq(&pfault_lock); - list_for_each_entry_safe(thread, next, &pfault_list, list) { - thread->pfault_wait = 0; - list_del(&thread->list); - tsk = container_of(thread, struct task_struct, thread); - wake_up_process(tsk); - put_task_struct(tsk); - } - spin_unlock_irq(&pfault_lock); - return 0; -} - -static int __init pfault_irq_init(void) -{ - int rc; - - rc = register_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt); - if (rc) - goto out_extint; - rc = pfault_init() == 0 ? 
0 : -EOPNOTSUPP; - if (rc) - goto out_pfault; - irq_subclass_register(IRQ_SUBCLASS_SERVICE_SIGNAL); - cpuhp_setup_state_nocalls(CPUHP_S390_PFAULT_DEAD, "s390/pfault:dead", - NULL, pfault_cpu_dead); - return 0; - -out_pfault: - unregister_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt); -out_extint: - pfault_disable = 1; - return rc; -} -early_initcall(pfault_irq_init); - -#endif /* CONFIG_PFAULT */ - #if IS_ENABLED(CONFIG_PGSTE) void do_secure_storage_access(struct pt_regs *regs) diff --git a/arch/s390/mm/pfault.c b/arch/s390/mm/pfault.c new file mode 100644 index 000000000000..5c0547f8d5ee --- /dev/null +++ b/arch/s390/mm/pfault.c @@ -0,0 +1,239 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 1999, 2023 + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#define __SUBCODE_MASK 0x0600 +#define __PF_RES_FIELD 0x8000000000000000ULL + +/* + * 'pfault' pseudo page faults routines. + */ +static int pfault_disable; + +static int __init nopfault(char *str) +{ + pfault_disable = 1; + return 1; +} + +__setup("nopfault", nopfault); + +struct pfault_refbk { + u16 refdiagc; + u16 reffcode; + u16 refdwlen; + u16 refversn; + u64 refgaddr; + u64 refselmk; + u64 refcmpmk; + u64 reserved; +} __attribute__ ((packed, aligned(8))); + +static struct pfault_refbk pfault_init_refbk = { + .refdiagc = 0x258, + .reffcode = 0, + .refdwlen = 5, + .refversn = 2, + .refgaddr = __LC_LPP, + .refselmk = 1ULL << 48, + .refcmpmk = 1ULL << 48, + .reserved = __PF_RES_FIELD +}; + +int __pfault_init(void) +{ + int rc; + + if (pfault_disable) + return -1; + diag_stat_inc(DIAG_STAT_X258); + asm volatile( + " diag %1,%0,0x258\n" + "0: j 2f\n" + "1: la %0,8\n" + "2:\n" + EX_TABLE(0b,1b) + : "=d" (rc) + : "a" (&pfault_init_refbk), "m" (pfault_init_refbk) : "cc"); + return rc; +} + +static struct pfault_refbk pfault_fini_refbk = { + .refdiagc = 0x258, + .reffcode = 1, + .refdwlen = 5, + .refversn = 2, +}; + +void __pfault_fini(void) +{ + + if 
(pfault_disable) + return; + diag_stat_inc(DIAG_STAT_X258); + asm volatile( + " diag %0,0,0x258\n" + "0: nopr %%r7\n" + EX_TABLE(0b,0b) + : : "a" (&pfault_fini_refbk), "m" (pfault_fini_refbk) : "cc"); +} + +static DEFINE_SPINLOCK(pfault_lock); +static LIST_HEAD(pfault_list); + +#define PF_COMPLETE 0x0080 + +/* + * The mechanism of our pfault code: if Linux is running as guest, runs a user + * space process and the user space process accesses a page that the host has + * paged out we get a pfault interrupt. + * + * This allows us, within the guest, to schedule a different process. Without + * this mechanism the host would have to suspend the whole virtual cpu until + * the page has been paged in. + * + * So when we get such an interrupt then we set the state of the current task + * to uninterruptible and also set the need_resched flag. Both happens within + * interrupt context(!). If we later on want to return to user space we + * recognize the need_resched flag and then call schedule(). It's not very + * obvious how this works... + * + * Of course we have a lot of additional fun with the completion interrupt (-> + * host signals that a page of a process has been paged in and the process can + * continue to run). This interrupt can arrive on any cpu and, since we have + * virtual cpus, actually appear before the interrupt that signals that a page + * is missing. + */ +static void pfault_interrupt(struct ext_code ext_code, + unsigned int param32, unsigned long param64) +{ + struct task_struct *tsk; + __u16 subcode; + pid_t pid; + + /* + * Get the external interruption subcode & pfault initial/completion + * signal bit. VM stores this in the 'cpu address' field associated + * with the external interrupt. + */ + subcode = ext_code.subcode; + if ((subcode & 0xff00) != __SUBCODE_MASK) + return; + inc_irq_stat(IRQEXT_PFL); + /* Get the token (= pid of the affected task). 
*/ + pid = param64 & LPP_PID_MASK; + rcu_read_lock(); + tsk = find_task_by_pid_ns(pid, &init_pid_ns); + if (tsk) + get_task_struct(tsk); + rcu_read_unlock(); + if (!tsk) + return; + spin_lock(&pfault_lock); + if (subcode & PF_COMPLETE) { + /* signal bit is set -> a page has been swapped in by VM */ + if (tsk->thread.pfault_wait == 1) { + /* Initial interrupt was faster than the completion + * interrupt. pfault_wait is valid. Set pfault_wait + * back to zero and wake up the process. This can + * safely be done because the task is still sleeping + * and can't produce new pfaults. */ + tsk->thread.pfault_wait = 0; + list_del(&tsk->thread.list); + wake_up_process(tsk); + put_task_struct(tsk); + } else { + /* Completion interrupt was faster than initial + * interrupt. Set pfault_wait to -1 so the initial + * interrupt doesn't put the task to sleep. + * If the task is not running, ignore the completion + * interrupt since it must be a leftover of a PFAULT + * CANCEL operation which didn't remove all pending + * completion interrupts. */ + if (task_is_running(tsk)) + tsk->thread.pfault_wait = -1; + } + } else { + /* signal bit not set -> a real page is missing. */ + if (WARN_ON_ONCE(tsk != current)) + goto out; + if (tsk->thread.pfault_wait == 1) { + /* Already on the list with a reference: put to sleep */ + goto block; + } else if (tsk->thread.pfault_wait == -1) { + /* Completion interrupt was faster than the initial + * interrupt (pfault_wait == -1). Set pfault_wait + * back to zero and exit. */ + tsk->thread.pfault_wait = 0; + } else { + /* Initial interrupt arrived before completion + * interrupt. Let the task sleep. + * An extra task reference is needed since a different + * cpu may set the task state to TASK_RUNNING again + * before the scheduler is reached. */ + get_task_struct(tsk); + tsk->thread.pfault_wait = 1; + list_add(&tsk->thread.list, &pfault_list); +block: + /* Since this must be a userspace fault, there + * is no kernel task state to trample. 
Rely on the + * return to userspace schedule() to block. */ + __set_current_state(TASK_UNINTERRUPTIBLE); + set_tsk_need_resched(tsk); + set_preempt_need_resched(); + } + } +out: + spin_unlock(&pfault_lock); + put_task_struct(tsk); +} + +static int pfault_cpu_dead(unsigned int cpu) +{ + struct thread_struct *thread, *next; + struct task_struct *tsk; + + spin_lock_irq(&pfault_lock); + list_for_each_entry_safe(thread, next, &pfault_list, list) { + thread->pfault_wait = 0; + list_del(&thread->list); + tsk = container_of(thread, struct task_struct, thread); + wake_up_process(tsk); + put_task_struct(tsk); + } + spin_unlock_irq(&pfault_lock); + return 0; +} + +static int __init pfault_irq_init(void) +{ + int rc; + + rc = register_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt); + if (rc) + goto out_extint; + rc = pfault_init() == 0 ? 0 : -EOPNOTSUPP; + if (rc) + goto out_pfault; + irq_subclass_register(IRQ_SUBCLASS_SERVICE_SIGNAL); + cpuhp_setup_state_nocalls(CPUHP_S390_PFAULT_DEAD, "s390/pfault:dead", + NULL, pfault_cpu_dead); + return 0; + +out_pfault: + unregister_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt); +out_extint: + pfault_disable = 1; + return rc; +} +early_initcall(pfault_irq_init); -- cgit v1.2.3 From b60624bb0a94126a26b9ac9653b4668a1a70ba2a Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 24 Jul 2023 17:20:27 +0200 Subject: s390/pfault: use UL instead of ULL Remove another leftover of the 31 bit era: replace the not needed "unsigned long long" suffix with "unsigned long", and stay consistent with the rest of the code.
Reviewed-by: Sven Schnelle Signed-off-by: Heiko Carstens --- arch/s390/mm/pfault.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/s390/mm/pfault.c b/arch/s390/mm/pfault.c index 5c0547f8d5ee..05865e5616b2 100644 --- a/arch/s390/mm/pfault.c +++ b/arch/s390/mm/pfault.c @@ -13,7 +13,7 @@ #include #define __SUBCODE_MASK 0x0600 -#define __PF_RES_FIELD 0x8000000000000000ULL +#define __PF_RES_FIELD 0x8000000000000000UL /* * 'pfault' pseudo page faults routines. @@ -45,8 +45,8 @@ static struct pfault_refbk pfault_init_refbk = { .refdwlen = 5, .refversn = 2, .refgaddr = __LC_LPP, - .refselmk = 1ULL << 48, - .refcmpmk = 1ULL << 48, + .refselmk = 1UL << 48, + .refcmpmk = 1UL << 48, .reserved = __PF_RES_FIELD }; -- cgit v1.2.3 From c5b6eef58f88810b8e641d8d32d24041fa20e016 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 24 Jul 2023 17:20:28 +0200 Subject: s390/pfault: remove not needed packed and aligned attributes struct pfault_refbk is naturally packed and aligned; remove not needed packed and aligned attributes. Reviewed-by: Sven Schnelle Signed-off-by: Heiko Carstens --- arch/s390/mm/pfault.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/mm/pfault.c b/arch/s390/mm/pfault.c index 05865e5616b2..e1c8cc14575d 100644 --- a/arch/s390/mm/pfault.c +++ b/arch/s390/mm/pfault.c @@ -37,7 +37,7 @@ struct pfault_refbk { u64 refselmk; u64 refcmpmk; u64 reserved; -} __attribute__ ((packed, aligned(8))); +}; static struct pfault_refbk pfault_init_refbk = { .refdiagc = 0x258, -- cgit v1.2.3 From 28254f36e2944a501e8bf440193e5c4f910cf10d Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 24 Jul 2023 17:20:29 +0200 Subject: s390/pfault: use early_param() instead of __setup() early_param() is the standard way of defining early kernel command line parameters. Use that instead of the old __setup() variant.
Reviewed-by: Sven Schnelle Signed-off-by: Heiko Carstens --- arch/s390/mm/pfault.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/s390/mm/pfault.c b/arch/s390/mm/pfault.c index e1c8cc14575d..64cc42d37c8b 100644 --- a/arch/s390/mm/pfault.c +++ b/arch/s390/mm/pfault.c @@ -25,8 +25,7 @@ static int __init nopfault(char *str) pfault_disable = 1; return 1; } - -__setup("nopfault", nopfault); +early_param("nopfault", nopfault); struct pfault_refbk { u16 refdiagc; -- cgit v1.2.3 From 4c89eb874420a94680d1b842bb9c3785997713a4 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 24 Jul 2023 17:20:30 +0200 Subject: s390/pfault: cleanup inline assemblies Cleanup the pfault inline assemblies: - Use symbolic names for operands - Add extra linebreaks, and whitespace to improve readability In addition, change __pfault_init() to return -EOPNOTSUPP in case of an exception, and don't return a made up valid diag 258 return value (aka "8"). This allows to simplify the inline assembly, and makes debugging easier, in case something is broken. 
Reviewed-by: Sven Schnelle Signed-off-by: Heiko Carstens --- arch/s390/include/asm/pfault.h | 4 +++- arch/s390/mm/pfault.c | 28 ++++++++++++++-------------- 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/arch/s390/include/asm/pfault.h b/arch/s390/include/asm/pfault.h index beabeebf2859..a1bee4a1e470 100644 --- a/arch/s390/include/asm/pfault.h +++ b/arch/s390/include/asm/pfault.h @@ -5,6 +5,8 @@ #ifndef _ASM_S390_PFAULT_H #define _ASM_S390_PFAULT_H +#include + int __pfault_init(void); void __pfault_fini(void); @@ -12,7 +14,7 @@ static inline int pfault_init(void) { if (IS_ENABLED(CONFIG_PFAULT)) return __pfault_init(); - return -1; + return -EOPNOTSUPP; } static inline void pfault_fini(void) diff --git a/arch/s390/mm/pfault.c b/arch/s390/mm/pfault.c index 64cc42d37c8b..1d65512c3351 100644 --- a/arch/s390/mm/pfault.c +++ b/arch/s390/mm/pfault.c @@ -51,20 +51,19 @@ static struct pfault_refbk pfault_init_refbk = { int __pfault_init(void) { - int rc; + int rc = -EOPNOTSUPP; if (pfault_disable) - return -1; + return rc; diag_stat_inc(DIAG_STAT_X258); asm volatile( - " diag %1,%0,0x258\n" - "0: j 2f\n" - "1: la %0,8\n" - "2:\n" - EX_TABLE(0b,1b) - : "=d" (rc) - : "a" (&pfault_init_refbk), "m" (pfault_init_refbk) : "cc"); - return rc; + " diag %[refbk],%[rc],0x258\n" + "0: nopr %%r7\n" + EX_TABLE(0b, 0b) + : [rc] "+d" (rc) + : [refbk] "a" (&pfault_init_refbk), "m" (pfault_init_refbk) + : "cc"); + return rc; } static struct pfault_refbk pfault_fini_refbk = { @@ -76,15 +75,16 @@ static struct pfault_refbk pfault_fini_refbk = { void __pfault_fini(void) { - if (pfault_disable) return; diag_stat_inc(DIAG_STAT_X258); asm volatile( - " diag %0,0,0x258\n" + " diag %[refbk],0,0x258\n" "0: nopr %%r7\n" - EX_TABLE(0b,0b) - : : "a" (&pfault_fini_refbk), "m" (pfault_fini_refbk) : "cc"); + EX_TABLE(0b, 0b) + : + : [refbk] "a" (&pfault_fini_refbk), "m" (pfault_fini_refbk) + : "cc"); } static DEFINE_SPINLOCK(pfault_lock); -- cgit v1.2.3 From 
46a923fd86eb51acfc9e833ce0a27cc09f3e1c45 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 24 Jul 2023 17:20:31 +0200 Subject: s390/pfault: use consistent comment style Use consistent comment style within the whole pfault C code. Reviewed-by: Sven Schnelle Signed-off-by: Heiko Carstens --- arch/s390/mm/pfault.c | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/arch/s390/mm/pfault.c b/arch/s390/mm/pfault.c index 1d65512c3351..1aac13bb8f53 100644 --- a/arch/s390/mm/pfault.c +++ b/arch/s390/mm/pfault.c @@ -142,23 +142,27 @@ static void pfault_interrupt(struct ext_code ext_code, if (subcode & PF_COMPLETE) { /* signal bit is set -> a page has been swapped in by VM */ if (tsk->thread.pfault_wait == 1) { - /* Initial interrupt was faster than the completion + /* + * Initial interrupt was faster than the completion * interrupt. pfault_wait is valid. Set pfault_wait * back to zero and wake up the process. This can * safely be done because the task is still sleeping - * and can't produce new pfaults. */ + * and can't produce new pfaults. + */ tsk->thread.pfault_wait = 0; list_del(&tsk->thread.list); wake_up_process(tsk); put_task_struct(tsk); } else { - /* Completion interrupt was faster than initial + /* + * Completion interrupt was faster than initial * interrupt. Set pfault_wait to -1 so the initial * interrupt doesn't put the task to sleep. * If the task is not running, ignore the completion * interrupt since it must be a leftover of a PFAULT * CANCEL operation which didn't remove all pending - * completion interrupts. */ + * completion interrupts. 
+ */ if (task_is_running(tsk)) tsk->thread.pfault_wait = -1; } @@ -170,23 +174,29 @@ static void pfault_interrupt(struct ext_code ext_code, /* Already on the list with a reference: put to sleep */ goto block; } else if (tsk->thread.pfault_wait == -1) { - /* Completion interrupt was faster than the initial + /* + * Completion interrupt was faster than the initial * interrupt (pfault_wait == -1). Set pfault_wait - * back to zero and exit. */ + * back to zero and exit. + */ tsk->thread.pfault_wait = 0; } else { - /* Initial interrupt arrived before completion + /* + * Initial interrupt arrived before completion * interrupt. Let the task sleep. * An extra task reference is needed since a different * cpu may set the task state to TASK_RUNNING again - * before the scheduler is reached. */ + * before the scheduler is reached. + */ get_task_struct(tsk); tsk->thread.pfault_wait = 1; list_add(&tsk->thread.list, &pfault_list); block: - /* Since this must be a userspace fault, there + /* + * Since this must be a userspace fault, there * is no kernel task state to trample. Rely on the - * return to userspace schedule() to block. */ + * return to userspace schedule() to block. + */ __set_current_state(TASK_UNINTERRUPTIBLE); set_tsk_need_resched(tsk); set_preempt_need_resched(); -- cgit v1.2.3 From 3e8fc2d492207353c5ee469241c8df36c9765471 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Fri, 28 Jul 2023 16:42:28 +0800 Subject: s390/cert_store: fix error return code in fill_cs_keyring() The 'rc' will be re-assigned to 0 after calling get_vcssb(); it needs to be set to an error code if create_cs_keyring() fails.
[hca@linux.ibm.com: slightly changed coding style] Fixes: 8cf57d7217c3 ("s390: add support for user-defined certificates") Signed-off-by: Yang Yingliang Link: https://lore.kernel.org/r/20230728084228.3186083-1-yangyingliang@huawei.com Signed-off-by: Heiko Carstens --- arch/s390/kernel/cert_store.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/s390/kernel/cert_store.c b/arch/s390/kernel/cert_store.c index 1cbeb9ce0eb1..3986a044eb36 100644 --- a/arch/s390/kernel/cert_store.c +++ b/arch/s390/kernel/cert_store.c @@ -702,6 +702,7 @@ static int fill_cs_keyring(void) if (rc) goto cleanup_keys; + rc = -ENOMEM; cs_keyring = create_cs_keyring(); if (!cs_keyring) goto cleanup_keys; -- cgit v1.2.3 From 481daa505bc3eb4ac7991e2e7a981506639935fd Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Fri, 28 Jul 2023 12:04:30 +0200 Subject: s390/cert_store: select CRYPTO_LIB_SHA256 A build failure was reported when sha256() is not present: gcc-13.1.0-nolibc/s390-linux/bin/s390-linux-ld: arch/s390/kernel/cert_store.o: in function `check_certificate_hash': arch/s390/kernel/cert_store.c:267: undefined reference to `sha256' Therefore make CONFIG_CERT_STORE select CRYPTO_LIB_SHA256. 
Fixes: 8cf57d7217c3 ("s390: add support for user-defined certificates") Reported-by: Randy Dunlap Closes: https://lore.kernel.org/all/8ecb57fb-4560-bdfc-9e55-63e3b0937132@infradead.org/ Signed-off-by: Sven Schnelle Tested-by: Randy Dunlap # build-tested Acked-by: Randy Dunlap Link: https://lore.kernel.org/r/20230728100430.1567328-1-svens@linux.ibm.com Signed-off-by: Heiko Carstens --- arch/s390/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index d9d50a7a2016..18bf754e1fad 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -516,6 +516,7 @@ config KEXEC_SIG config CERT_STORE bool "Get user certificates via DIAG320" depends on KEYS + select CRYPTO_LIB_SHA256 help Enable this option if you want to access user-provided secure boot certificates via DIAG 0x320. -- cgit v1.2.3 From e1b9c2749af020e6c915eb07fcd53fa3a1a074e6 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 1 Aug 2023 15:05:30 +0200 Subject: s390/smp: ensure global control register contents are in sync Globally setting a bit in control registers is done with smp_ctl_set_clear_bit(). This is using on_each_cpu() to execute a function which actually sets the control register bit on each online CPU. This can be problematic since on_each_cpu() does not prevent that new CPUs come online while it is executed, which in turn means that control register updates could be missing on new CPUs. In order to prevent this problem make sure that global control register contents cannot change until new CPUs have initialized their control registers, and marked themselves online, so they are included in subsequent on_each_cpu() calls. 
Reviewed-by: Sven Schnelle Reviewed-by: Alexander Gordeev Signed-off-by: Heiko Carstens --- arch/s390/kernel/smp.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 9244130721d6..a4edb7ea66ea 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -253,8 +253,9 @@ static void pcpu_free_lowcore(struct pcpu *pcpu) static void pcpu_prepare_secondary(struct pcpu *pcpu, int cpu) { - struct lowcore *lc = lowcore_ptr[cpu]; + struct lowcore *lc, *abs_lc; + lc = lowcore_ptr[cpu]; cpumask_set_cpu(cpu, &init_mm.context.cpu_attach_mask); cpumask_set_cpu(cpu, mm_cpumask(&init_mm)); lc->cpu_nr = cpu; @@ -267,7 +268,9 @@ static void pcpu_prepare_secondary(struct pcpu *pcpu, int cpu) lc->machine_flags = S390_lowcore.machine_flags; lc->user_timer = lc->system_timer = lc->steal_timer = lc->avg_steal_timer = 0; - __ctl_store(lc->cregs_save_area, 0, 15); + abs_lc = get_abs_lowcore(); + memcpy(lc->cregs_save_area, abs_lc->cregs_save_area, sizeof(lc->cregs_save_area)); + put_abs_lowcore(abs_lc); lc->cregs_save_area[1] = lc->kernel_asce; lc->cregs_save_area[7] = lc->user_asce; save_access_regs((unsigned int *) lc->access_regs_save_area); @@ -607,8 +610,8 @@ void smp_ctl_set_clear_bit(int cr, int bit, bool set) ctlreg = (ctlreg & parms.andval) | parms.orval; abs_lc->cregs_save_area[cr] = ctlreg; put_abs_lowcore(abs_lc); - spin_unlock(&ctl_lock); on_each_cpu(smp_ctl_bit_callback, &parms, 1); + spin_unlock(&ctl_lock); } EXPORT_SYMBOL(smp_ctl_set_clear_bit); @@ -928,12 +931,18 @@ int __cpu_up(unsigned int cpu, struct task_struct *tidle) rc = pcpu_alloc_lowcore(pcpu, cpu); if (rc) return rc; + /* + * Make sure global control register contents do not change + * until new CPU has initialized control registers. 
+ */ + spin_lock(&ctl_lock); pcpu_prepare_secondary(pcpu, cpu); pcpu_attach_task(pcpu, tidle); pcpu_start_fn(pcpu, smp_start_secondary, NULL); /* Wait until cpu puts itself in the online & active maps */ while (!cpu_online(cpu)) cpu_relax(); + spin_unlock(&ctl_lock); return 0; } -- cgit v1.2.3 From 1e66317a7f575b07a837b7dcbf4bf636282ea8a2 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 7 Aug 2023 00:16:38 +0900 Subject: s390: remove unneeded #include <asm/export.h> There is no EXPORT_SYMBOL line there, hence #include <asm/export.h> is unneeded. Signed-off-by: Masahiro Yamada Link: https://lore.kernel.org/r/20230806151641.394720-1-masahiroy@kernel.org Signed-off-by: Heiko Carstens --- arch/s390/kernel/mcount.S | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/s390/kernel/mcount.S b/arch/s390/kernel/mcount.S index 71c5fa05e7f1..ae4d4fd9afcd 100644 --- a/arch/s390/kernel/mcount.S +++ b/arch/s390/kernel/mcount.S @@ -9,8 +9,6 @@ #include #include #include -#include - #define STACK_FRAME_SIZE_PTREGS (STACK_FRAME_OVERHEAD + __PT_SIZE) #define STACK_PTREGS (STACK_FRAME_OVERHEAD) -- cgit v1.2.3 From b8c723f1e62b40af4db91c04989272fb7056b30b Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 7 Aug 2023 00:16:39 +0900 Subject: s390: replace #include <asm/export.h> with #include <linux/export.h> Commit ddb5cdbafaaa ("kbuild: generate KSYMTAB entries by modpost") deprecated <asm/export.h>, which is now a wrapper of <linux/export.h>. Replace #include <asm/export.h> with #include <linux/export.h>. After all the <asm/export.h> lines are converted, <asm/export.h> and <asm-generic/export.h> will be removed.
Signed-off-by: Masahiro Yamada Link: https://lore.kernel.org/r/20230806151641.394720-2-masahiroy@kernel.org Signed-off-by: Heiko Carstens --- arch/s390/kernel/entry.S | 2 +- arch/s390/lib/mem.S | 2 +- arch/s390/lib/tishift.S | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index a660f4b6d654..49a11f6dd7ae 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -8,6 +8,7 @@ * Denis Joseph Barrow (djbarrow@de.ibm.com,barrow_dj@yahoo.com), */ +#include #include #include #include @@ -26,7 +27,6 @@ #include #include #include -#include #include _LPP_OFFSET = __LC_LPP diff --git a/arch/s390/lib/mem.S b/arch/s390/lib/mem.S index 5a9a55de2e10..08f60a42b9a6 100644 --- a/arch/s390/lib/mem.S +++ b/arch/s390/lib/mem.S @@ -5,8 +5,8 @@ * Copyright IBM Corp. 2012 */ +#include #include -#include #include GEN_BR_THUNK %r14 diff --git a/arch/s390/lib/tishift.S b/arch/s390/lib/tishift.S index de33cf02cfd2..96214f51f49b 100644 --- a/arch/s390/lib/tishift.S +++ b/arch/s390/lib/tishift.S @@ -1,8 +1,8 @@ /* SPDX-License-Identifier: GPL-2.0 */ +#include #include #include -#include .section .noinstr.text, "ax" -- cgit v1.2.3 From ee4ac5275fd82a877e2fe6aeefcc3c4268f43388 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 7 Aug 2023 00:16:40 +0900 Subject: s390: remove <asm/export.h> All *.S files under arch/s390/ have been converted to include <linux/export.h> instead of <asm/export.h>. Remove <asm/export.h>.
Signed-off-by: Masahiro Yamada Link: https://lore.kernel.org/r/20230806151641.394720-3-masahiroy@kernel.org Signed-off-by: Heiko Carstens --- arch/s390/include/asm/Kbuild | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/s390/include/asm/Kbuild b/arch/s390/include/asm/Kbuild index 1a18d7b82f86..4b904110d27c 100644 --- a/arch/s390/include/asm/Kbuild +++ b/arch/s390/include/asm/Kbuild @@ -5,6 +5,5 @@ generated-y += syscall_table.h generated-y += unistd_nr.h generic-y += asm-offsets.h -generic-y += export.h generic-y += kvm_types.h generic-y += mcs_spinlock.h -- cgit v1.2.3 From 8ddccc8a7d06f7ea4d8579970c95609d1b1de77b Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Thu, 6 Jul 2023 12:28:17 +0200 Subject: s390/boot: cleanup number of page table levels setup The separate vmalloc area size check against _REGION2_SIZE is needed in case user provided insanely large value using vmalloc= kernel command line parameter. That could lead to overflow and selecting 3 page table levels instead of 4. Use size_add() for the overflow check and get rid of the extra vmalloc area check. With the current values of CONFIG_MAX_PHYSMEM_BITS and PAGES_PER_SECTION the sum of maximal possible size of identity mapping and vmemmap area (derived from these macros) plus modules area size MODULES_LEN can not overflow. Thus, that sum is used as first addend while vmalloc area size is second addend for size_add(). 
Suggested-by: Heiko Carstens Acked-by: Heiko Carstens Signed-off-by: Alexander Gordeev Signed-off-by: Heiko Carstens --- arch/s390/boot/startup.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c index a1f792fcc710..b058e2a575c1 100644 --- a/arch/s390/boot/startup.c +++ b/arch/s390/boot/startup.c @@ -177,6 +177,7 @@ static unsigned long setup_kernel_memory_layout(void) unsigned long asce_limit; unsigned long rte_size; unsigned long pages; + unsigned long vsize; unsigned long vmax; pages = ident_map_size / PAGE_SIZE; @@ -184,11 +185,9 @@ static unsigned long setup_kernel_memory_layout(void) vmemmap_size = SECTION_ALIGN_UP(pages) * sizeof(struct page); /* choose kernel address space layout: 4 or 3 levels. */ - vmemmap_start = round_up(ident_map_size, _REGION3_SIZE); - if (IS_ENABLED(CONFIG_KASAN) || - vmalloc_size > _REGION2_SIZE || - vmemmap_start + vmemmap_size + vmalloc_size + MODULES_LEN > - _REGION2_SIZE) { + vsize = round_up(ident_map_size, _REGION3_SIZE) + vmemmap_size + MODULES_LEN; + vsize = size_add(vsize, vmalloc_size); + if (IS_ENABLED(CONFIG_KASAN) || (vsize > _REGION2_SIZE)) { asce_limit = _REGION1_SIZE; rte_size = _REGION2_SIZE; } else { -- cgit v1.2.3 From a984f27ec26323204045c306f8b25bc61e042626 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Fri, 4 Aug 2023 15:39:06 +0200 Subject: s390/mm: define Real Memory Copy size and mask macros Make Real Memory Copy area size and mask explicit. This does not bring any functional change and only needed for clarity. 
Acked-by: Heiko Carstens Signed-off-by: Alexander Gordeev Signed-off-by: Heiko Carstens --- arch/s390/boot/startup.c | 2 +- arch/s390/include/asm/maccess.h | 3 +++ arch/s390/mm/dump_pagetables.c | 2 +- arch/s390/mm/maccess.c | 7 ++++--- 4 files changed, 9 insertions(+), 5 deletions(-) diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c index b058e2a575c1..d61428190cdd 100644 --- a/arch/s390/boot/startup.c +++ b/arch/s390/boot/startup.c @@ -204,7 +204,7 @@ static unsigned long setup_kernel_memory_layout(void) /* force vmalloc and modules below kasan shadow */ vmax = min(vmax, KASAN_SHADOW_START); #endif - __memcpy_real_area = round_down(vmax - PAGE_SIZE, PAGE_SIZE); + __memcpy_real_area = round_down(vmax - MEMCPY_REAL_SIZE, PAGE_SIZE); __abs_lowcore = round_down(__memcpy_real_area - ABS_LOWCORE_MAP_SIZE, sizeof(struct lowcore)); MODULES_END = round_down(__abs_lowcore, _SEGMENT_SIZE); diff --git a/arch/s390/include/asm/maccess.h b/arch/s390/include/asm/maccess.h index cfec3141fdba..50225940d971 100644 --- a/arch/s390/include/asm/maccess.h +++ b/arch/s390/include/asm/maccess.h @@ -4,6 +4,9 @@ #include +#define MEMCPY_REAL_SIZE PAGE_SIZE +#define MEMCPY_REAL_MASK PAGE_MASK + struct iov_iter; extern unsigned long __memcpy_real_area; diff --git a/arch/s390/mm/dump_pagetables.c b/arch/s390/mm/dump_pagetables.c index ba5f80268878..afa5db750d92 100644 --- a/arch/s390/mm/dump_pagetables.c +++ b/arch/s390/mm/dump_pagetables.c @@ -297,7 +297,7 @@ static int pt_dump_init(void) address_markers[ABS_LOWCORE_NR].start_address = __abs_lowcore; address_markers[ABS_LOWCORE_END_NR].start_address = __abs_lowcore + ABS_LOWCORE_MAP_SIZE; address_markers[MEMCPY_REAL_NR].start_address = __memcpy_real_area; - address_markers[MEMCPY_REAL_END_NR].start_address = __memcpy_real_area + PAGE_SIZE; + address_markers[MEMCPY_REAL_END_NR].start_address = __memcpy_real_area + MEMCPY_REAL_SIZE; address_markers[VMEMMAP_NR].start_address = (unsigned long) vmemmap; 
address_markers[VMEMMAP_END_NR].start_address = (unsigned long)vmemmap + vmemmap_size; address_markers[VMALLOC_NR].start_address = VMALLOC_START; diff --git a/arch/s390/mm/maccess.c b/arch/s390/mm/maccess.c index cbe1df1e9c18..c805b3e2592b 100644 --- a/arch/s390/mm/maccess.c +++ b/arch/s390/mm/maccess.c @@ -86,11 +86,12 @@ size_t memcpy_real_iter(struct iov_iter *iter, unsigned long src, size_t count) void *chunk; pte_t pte; + BUILD_BUG_ON(MEMCPY_REAL_SIZE != PAGE_SIZE); while (count) { - phys = src & PAGE_MASK; - offset = src & ~PAGE_MASK; + phys = src & MEMCPY_REAL_MASK; + offset = src & ~MEMCPY_REAL_MASK; chunk = (void *)(__memcpy_real_area + offset); - len = min(count, PAGE_SIZE - offset); + len = min(count, MEMCPY_REAL_SIZE - offset); pte = mk_pte_phys(phys, PAGE_KERNEL_RO); mutex_lock(&memcpy_real_mutex); -- cgit v1.2.3 From 09cd4ffafb2fbb1a18a88890b13520c501409d99 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Fri, 4 Aug 2023 15:24:28 +0200 Subject: s390/boot: account Real Memory Copy and Lowcore areas Real Memory Copy and (absolute) Lowcore areas are not accounted when virtual memory layout is set up. Fixes: 4df29d2b9024 ("s390/smp: rework absolute lowcore access") Fixes: 2f0e8aae26a2 ("s390/mm: rework memcpy_real() to avoid DAT-off mode") Acked-by: Heiko Carstens Signed-off-by: Alexander Gordeev Signed-off-by: Heiko Carstens --- arch/s390/boot/startup.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c index d61428190cdd..a81f92563037 100644 --- a/arch/s390/boot/startup.c +++ b/arch/s390/boot/startup.c @@ -185,7 +185,8 @@ static unsigned long setup_kernel_memory_layout(void) vmemmap_size = SECTION_ALIGN_UP(pages) * sizeof(struct page); /* choose kernel address space layout: 4 or 3 levels. 
*/ - vsize = round_up(ident_map_size, _REGION3_SIZE) + vmemmap_size + MODULES_LEN; + vsize = round_up(ident_map_size, _REGION3_SIZE) + vmemmap_size + + MODULES_LEN + MEMCPY_REAL_SIZE + ABS_LOWCORE_MAP_SIZE; vsize = size_add(vsize, vmalloc_size); if (IS_ENABLED(CONFIG_KASAN) || (vsize > _REGION2_SIZE)) { asce_limit = _REGION1_SIZE; -- cgit v1.2.3 From 5cfdff02e97a46f90b3ba408af62ad3dcb0dc586 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Sat, 5 Aug 2023 10:59:09 +0200 Subject: s390/boot: fix multi-line comments style Make multi-line comment style consistent across the source. Acked-by: Heiko Carstens Signed-off-by: Alexander Gordeev Signed-off-by: Heiko Carstens --- arch/s390/boot/startup.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c index a81f92563037..b9681cb22753 100644 --- a/arch/s390/boot/startup.c +++ b/arch/s390/boot/startup.c @@ -195,8 +195,9 @@ static unsigned long setup_kernel_memory_layout(void) asce_limit = _REGION2_SIZE; rte_size = _REGION3_SIZE; } + /* - * forcing modules and vmalloc area under the ultravisor + * Forcing modules and vmalloc area under the ultravisor * secure storage limit, so that any vmalloc allocation * we do could be used to back secure guest storage. */ @@ -288,8 +289,9 @@ void startup_kernel(void) setup_lpp(); safe_addr = mem_safe_offset(); + /* - * reserve decompressor memory together with decompression heap, buffer and + * Reserve decompressor memory together with decompression heap, buffer and * memory which might be occupied by uncompressed kernel at default 1Mb * position (if KASLR is off or failed). 
*/ -- cgit v1.2.3 From 2d1494fb31405df0dfb6006fdb2b24e7880258cd Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Sat, 12 Aug 2023 17:12:54 +0200 Subject: s390/mm: make virt_to_pfn() a static inline Making virt_to_pfn() a static inline taking a strongly typed (const void *) makes the contract of passing a pointer of that type to the function explicit and exposes any misuse of the macro virt_to_pfn() acting polymorphic and accepting many types such as (void *), (uintptr_t) or (unsigned long) as arguments without warnings. For symmetry do the same with pfn_to_virt() reflecting the current layout in asm-generic/page.h. Doing this reveals a number of offenders in the arch code and the S390-specific drivers, so just bite the bullet and fix up all of those as well. Signed-off-by: Linus Walleij Reviewed-by: Alexander Gordeev Link: https://lore.kernel.org/r/20230812-virt-to-phys-s390-v2-1-6c40f31fe36f@linaro.org Signed-off-by: Heiko Carstens --- arch/s390/include/asm/kfence.h | 2 +- arch/s390/include/asm/page.h | 12 ++++++++++-- arch/s390/mm/cmm.c | 2 +- arch/s390/mm/vmem.c | 2 +- drivers/s390/block/scm_blk.c | 2 +- drivers/s390/char/vmcp.c | 2 +- 6 files changed, 15 insertions(+), 7 deletions(-) diff --git a/arch/s390/include/asm/kfence.h b/arch/s390/include/asm/kfence.h index d55ba878378b..e47fd8cbe701 100644 --- a/arch/s390/include/asm/kfence.h +++ b/arch/s390/include/asm/kfence.h @@ -35,7 +35,7 @@ static __always_inline void kfence_split_mapping(void) static inline bool kfence_protect_page(unsigned long addr, bool protect) { - __kernel_map_pages(virt_to_page(addr), 1, !protect); + __kernel_map_pages(virt_to_page((void *)addr), 1, !protect); return true; } diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h index a9c138fcd2ad..cfec0743314e 100644 --- a/arch/s390/include/asm/page.h +++ b/arch/s390/include/asm/page.h @@ -191,8 +191,16 @@ int arch_make_page_accessible(struct page *page); #define phys_to_page(phys) pfn_to_page(phys_to_pfn(phys))
#define page_to_phys(page) pfn_to_phys(page_to_pfn(page)) -#define pfn_to_virt(pfn) __va(pfn_to_phys(pfn)) -#define virt_to_pfn(kaddr) (phys_to_pfn(__pa(kaddr))) +static inline void *pfn_to_virt(unsigned long pfn) +{ + return __va(pfn_to_phys(pfn)); +} + +static inline unsigned long virt_to_pfn(const void *kaddr) +{ + return phys_to_pfn(__pa(kaddr)); +} + #define pfn_to_kaddr(pfn) pfn_to_virt(pfn) #define virt_to_page(kaddr) pfn_to_page(virt_to_pfn(kaddr)) diff --git a/arch/s390/mm/cmm.c b/arch/s390/mm/cmm.c index 5300c6867d5e..f47515313226 100644 --- a/arch/s390/mm/cmm.c +++ b/arch/s390/mm/cmm.c @@ -90,7 +90,7 @@ static long cmm_alloc_pages(long nr, long *counter, } else free_page((unsigned long) npa); } - diag10_range(virt_to_pfn(addr), 1); + diag10_range(virt_to_pfn((void *)addr), 1); pa->pages[pa->index++] = addr; (*counter)++; spin_unlock(&cmm_lock); diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index be69cb2d47eb..3391efb59641 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -36,7 +36,7 @@ static void vmem_free_pages(unsigned long addr, int order) { /* We don't expect boot memory to be removed ever. 
*/ if (!slab_is_available() || - WARN_ON_ONCE(PageReserved(virt_to_page(addr)))) + WARN_ON_ONCE(PageReserved(virt_to_page((void *)addr)))) return; free_pages(addr, order); } diff --git a/drivers/s390/block/scm_blk.c b/drivers/s390/block/scm_blk.c index 0c1df1d5f1ac..3a9cc8a4a230 100644 --- a/drivers/s390/block/scm_blk.c +++ b/drivers/s390/block/scm_blk.c @@ -134,7 +134,7 @@ static void scm_request_done(struct scm_request *scmrq) if ((msb->flags & MSB_FLAG_IDA) && aidaw && IS_ALIGNED(aidaw, PAGE_SIZE)) - mempool_free(virt_to_page(aidaw), aidaw_pool); + mempool_free(virt_to_page((void *)aidaw), aidaw_pool); } spin_lock_irqsave(&list_lock, flags); diff --git a/drivers/s390/char/vmcp.c b/drivers/s390/char/vmcp.c index 4cebfaaa22b4..eb0520a9d4af 100644 --- a/drivers/s390/char/vmcp.c +++ b/drivers/s390/char/vmcp.c @@ -89,7 +89,7 @@ static void vmcp_response_free(struct vmcp_session *session) order = get_order(session->bufsize); nr_pages = ALIGN(session->bufsize, PAGE_SIZE) >> PAGE_SHIFT; if (session->cma_alloc) { - page = virt_to_page((unsigned long)session->response); + page = virt_to_page(session->response); cma_release(vmcp_cma, page, nr_pages); session->cma_alloc = 0; } else { -- cgit v1.2.3 From c8f40a0bccefd613748d080147469a4652d6e74c Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Thu, 10 Aug 2023 10:22:36 +0200 Subject: s390/dcssblk: fix kernel crash with list_add corruption Commit fb08a1908cb1 ("dax: simplify the dax_device <-> gendisk association") introduced new logic for gendisk association, requiring drivers to explicitly call dax_add_host() and dax_remove_host(). For dcssblk driver, some dax_remove_host() calls were missing, e.g. in device remove path. The commit also broke error handling for out_dax case in device add path, resulting in an extra put_device() w/o the previous get_device() in that case. This led to stale xarray entries after device add / remove cycles.
In the case when a previously used struct gendisk pointer (xarray index) would be used again, because blk_alloc_disk() happened to return such a pointer, the xa_insert() in dax_add_host() would fail and go to out_dax, doing the extra put_device() in the error path. In combination with an already flawed error handling in dcssblk (device_register() cleanup), which needs to be addressed in a separate patch, this resulted in a missing device_del() / klist_del(), and eventually in the kernel crash with list_add corruption on a subsequent device_add() / klist_add(). Fix this by adding the missing dax_remove_host() calls, and also move the put_device() in the error path to restore the previous logic. Fixes: fb08a1908cb1 ("dax: simplify the dax_device <-> gendisk association") Cc: stable@vger.kernel.org # 5.17+ Acked-by: Heiko Carstens Signed-off-by: Gerald Schaefer Signed-off-by: Heiko Carstens --- drivers/s390/block/dcssblk.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index 6eafd0a34483..06bcb6c78909 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -412,6 +412,7 @@ removeseg: } list_del(&dev_info->lh); + dax_remove_host(dev_info->gd); kill_dax(dev_info->dax_dev); put_dax(dev_info->dax_dev); del_gendisk(dev_info->gd); @@ -707,9 +708,9 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char goto out; out_dax_host: + put_device(&dev_info->dev); dax_remove_host(dev_info->gd); out_dax: - put_device(&dev_info->dev); kill_dax(dev_info->dax_dev); put_dax(dev_info->dax_dev); put_dev: @@ -789,6 +790,7 @@ dcssblk_remove_store(struct device *dev, struct device_attribute *attr, const ch } list_del(&dev_info->lh); + dax_remove_host(dev_info->gd); kill_dax(dev_info->dax_dev); put_dax(dev_info->dax_dev); del_gendisk(dev_info->gd); -- cgit v1.2.3 From ea5717cb13468323a7c3dd394748301802991f39 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Tue, 15 Aug 2023 09:26:06
+0200 Subject: s390/ipl: add missing secure/has_secure file to ipl type 'unknown' OS installers are relying on /sys/firmware/ipl/has_secure to be present on machines supporting secure boot. This file is present for all IPL types, but not the unknown type, which prevents a secure installation when an LPAR is booted in HMC via FTP(s), because this is an unknown IPL type in linux. While at it, also add the secure file. Fixes: c9896acc7851 ("s390/ipl: Provide has_secure sysfs attribute") Cc: stable@vger.kernel.org Signed-off-by: Sven Schnelle Reviewed-by: Heiko Carstens Signed-off-by: Heiko Carstens --- arch/s390/kernel/ipl.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c index 85a00d97a314..dfcb2b563e2b 100644 --- a/arch/s390/kernel/ipl.c +++ b/arch/s390/kernel/ipl.c @@ -640,6 +640,8 @@ static struct attribute_group ipl_ccw_attr_group_lpar = { static struct attribute *ipl_unknown_attrs[] = { &sys_ipl_type_attr.attr, + &sys_ipl_secure_attr.attr, + &sys_ipl_has_secure_attr.attr, NULL, }; -- cgit v1.2.3 From 7645dcddc2666f57e37ed3dda50b8b273b62c88a Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Tue, 15 Aug 2023 10:27:08 +0200 Subject: s390/ipl: add common ipl parameter attribute group All ipl types have 'secure','has_secure' and type parameters. Move these to a common ipl parameter group so that they don't need to be present in each ipl parameter group. 
Signed-off-by: Sven Schnelle Reviewed-by: Heiko Carstens Signed-off-by: Heiko Carstens --- arch/s390/kernel/ipl.c | 28 ++++++---------------------- 1 file changed, 6 insertions(+), 22 deletions(-) diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c index dfcb2b563e2b..7f3a84e414a4 100644 --- a/arch/s390/kernel/ipl.c +++ b/arch/s390/kernel/ipl.c @@ -557,15 +557,12 @@ static struct kobj_attribute sys_ipl_ccw_loadparm_attr = __ATTR(loadparm, 0444, ipl_ccw_loadparm_show, NULL); static struct attribute *ipl_fcp_attrs[] = { - &sys_ipl_type_attr.attr, &sys_ipl_device_attr.attr, &sys_ipl_fcp_wwpn_attr.attr, &sys_ipl_fcp_lun_attr.attr, &sys_ipl_fcp_bootprog_attr.attr, &sys_ipl_fcp_br_lba_attr.attr, &sys_ipl_ccw_loadparm_attr.attr, - &sys_ipl_secure_attr.attr, - &sys_ipl_has_secure_attr.attr, NULL, }; @@ -575,14 +572,11 @@ static struct attribute_group ipl_fcp_attr_group = { }; static struct attribute *ipl_nvme_attrs[] = { - &sys_ipl_type_attr.attr, &sys_ipl_nvme_fid_attr.attr, &sys_ipl_nvme_nsid_attr.attr, &sys_ipl_nvme_bootprog_attr.attr, &sys_ipl_nvme_br_lba_attr.attr, &sys_ipl_ccw_loadparm_attr.attr, - &sys_ipl_secure_attr.attr, - &sys_ipl_has_secure_attr.attr, NULL, }; @@ -592,13 +586,10 @@ static struct attribute_group ipl_nvme_attr_group = { }; static struct attribute *ipl_eckd_attrs[] = { - &sys_ipl_type_attr.attr, &sys_ipl_eckd_bootprog_attr.attr, &sys_ipl_eckd_br_chr_attr.attr, &sys_ipl_ccw_loadparm_attr.attr, &sys_ipl_device_attr.attr, - &sys_ipl_secure_attr.attr, - &sys_ipl_has_secure_attr.attr, NULL, }; @@ -610,21 +601,15 @@ static struct attribute_group ipl_eckd_attr_group = { /* CCW ipl device attributes */ static struct attribute *ipl_ccw_attrs_vm[] = { - &sys_ipl_type_attr.attr, &sys_ipl_device_attr.attr, &sys_ipl_ccw_loadparm_attr.attr, &sys_ipl_vm_parm_attr.attr, - &sys_ipl_secure_attr.attr, - &sys_ipl_has_secure_attr.attr, NULL, }; static struct attribute *ipl_ccw_attrs_lpar[] = { - &sys_ipl_type_attr.attr, &sys_ipl_device_attr.attr, 
&sys_ipl_ccw_loadparm_attr.attr, - &sys_ipl_secure_attr.attr, - &sys_ipl_has_secure_attr.attr, NULL, }; @@ -636,17 +621,15 @@ static struct attribute_group ipl_ccw_attr_group_lpar = { .attrs = ipl_ccw_attrs_lpar }; -/* UNKNOWN ipl device attributes */ - -static struct attribute *ipl_unknown_attrs[] = { +static struct attribute *ipl_common_attrs[] = { &sys_ipl_type_attr.attr, &sys_ipl_secure_attr.attr, &sys_ipl_has_secure_attr.attr, NULL, }; -static struct attribute_group ipl_unknown_attr_group = { - .attrs = ipl_unknown_attrs, +static struct attribute_group ipl_common_attr_group = { + .attrs = ipl_common_attrs, }; static struct kset *ipl_kset; @@ -670,6 +653,9 @@ static int __init ipl_init(void) rc = -ENOMEM; goto out; } + rc = sysfs_create_group(&ipl_kset->kobj, &ipl_common_attr_group); + if (rc) + goto out; switch (ipl_info.type) { case IPL_TYPE_CCW: if (MACHINE_IS_VM) @@ -691,8 +677,6 @@ static int __init ipl_init(void) rc = sysfs_create_group(&ipl_kset->kobj, &ipl_nvme_attr_group); break; default: - rc = sysfs_create_group(&ipl_kset->kobj, - &ipl_unknown_attr_group); break; } out: -- cgit v1.2.3 From 37a08f010b7c423b5e4c9ed3b187d21166553007 Mon Sep 17 00:00:00 2001 From: Holger Dengler Date: Wed, 26 Jul 2023 11:33:45 +0200 Subject: s390/pkey: fix/harmonize internal keyblob headers Commit 'fa6999e326fe ("s390/pkey: support CCA and EP11 secure ECC private keys")' introduced PKEY_TYPE_EP11_AES as a supplement to PKEY_TYPE_EP11. All pkeys have an internal header/payload structure, which is opaque to the userspace. The header structures for PKEY_TYPE_EP11 and PKEY_TYPE_EP11_AES are nearly identical and there is no reason, why different structures are used. In preparation to fix the keyversion handling in the broken PKEY IOCTLs, the same header structure is used for PKEY_TYPE_EP11 and PKEY_TYPE_EP11_AES. This reduces the number of different code paths and increases the readability. 
Fixes: fa6999e326fe ("s390/pkey: support CCA and EP11 secure ECC private keys") Signed-off-by: Holger Dengler Reviewed-by: Ingo Franzki Signed-off-by: Heiko Carstens --- drivers/s390/crypto/pkey_api.c | 2 +- drivers/s390/crypto/zcrypt_ep11misc.c | 4 ++-- drivers/s390/crypto/zcrypt_ep11misc.h | 9 +-------- 3 files changed, 4 insertions(+), 11 deletions(-) diff --git a/drivers/s390/crypto/pkey_api.c b/drivers/s390/crypto/pkey_api.c index e58bfd225323..ba8581e0809c 100644 --- a/drivers/s390/crypto/pkey_api.c +++ b/drivers/s390/crypto/pkey_api.c @@ -895,7 +895,7 @@ static int pkey_verifykey2(const u8 *key, size_t keylen, if (ktype) *ktype = PKEY_TYPE_EP11; if (ksize) - *ksize = kb->head.keybitlen; + *ksize = kb->head.bitlen; rc = ep11_findcard2(&_apqns, &_nr_apqns, *cardnr, *domain, ZCRYPT_CEX7, EP11_API_V, kb->wkvp); diff --git a/drivers/s390/crypto/zcrypt_ep11misc.c b/drivers/s390/crypto/zcrypt_ep11misc.c index 958f5ee47f1b..d7ecd6ce5b7a 100644 --- a/drivers/s390/crypto/zcrypt_ep11misc.c +++ b/drivers/s390/crypto/zcrypt_ep11misc.c @@ -787,7 +787,7 @@ int ep11_genaeskey(u16 card, u16 domain, u32 keybitsize, u32 keygenflags, kb->head.type = TOKTYPE_NON_CCA; kb->head.len = rep_pl->data_len; kb->head.version = TOKVER_EP11_AES; - kb->head.keybitlen = keybitsize; + kb->head.bitlen = keybitsize; out: kfree(req); @@ -1055,7 +1055,7 @@ static int ep11_unwrapkey(u16 card, u16 domain, kb->head.type = TOKTYPE_NON_CCA; kb->head.len = rep_pl->data_len; kb->head.version = TOKVER_EP11_AES; - kb->head.keybitlen = keybitsize; + kb->head.bitlen = keybitsize; out: kfree(req); diff --git a/drivers/s390/crypto/zcrypt_ep11misc.h b/drivers/s390/crypto/zcrypt_ep11misc.h index a3eddf51242d..67cc80d71ba3 100644 --- a/drivers/s390/crypto/zcrypt_ep11misc.h +++ b/drivers/s390/crypto/zcrypt_ep11misc.h @@ -29,14 +29,7 @@ struct ep11keyblob { union { u8 session[32]; /* only used for PKEY_TYPE_EP11: */ - struct { - u8 type; /* 0x00 (TOKTYPE_NON_CCA) */ - u8 res0; /* unused */ - u16 len; /* total 
length in bytes of this blob */ - u8 version; /* 0x03 (TOKVER_EP11_AES) */ - u8 res1; /* unused */ - u16 keybitlen; /* clear key bit len, 0 for unknown */ - } head; + struct ep11kblob_header head; }; u8 wkvp[16]; /* wrapping key verification pattern */ u64 attr; /* boolean key attributes */ -- cgit v1.2.3 From fb249ce7f7bfd8621a38e4ad401ba74b680786d4 Mon Sep 17 00:00:00 2001 From: Holger Dengler Date: Tue, 25 Jul 2023 09:49:55 +0200 Subject: s390/pkey: fix PKEY_TYPE_EP11_AES handling in PKEY_GENSECK2 IOCTL Commit 'fa6999e326fe ("s390/pkey: support CCA and EP11 secure ECC private keys")' introduced PKEY_TYPE_EP11_AES for the PKEY_GENSECK2 IOCTL, to enable userspace to generate securekey blobs of this type. Unfortunately, all PKEY_GENSECK2 IOCTL requests for PKEY_TYPE_EP11_AES return with an error (-EINVAL). Fix the handling for PKEY_TYPE_EP11_AES in PKEY_GENSECK2 IOCTL, so that userspace can generate securekey blobs of this type. The start of the header and the keyblob, as well as the length need special handling, depending on the internal keyversion. Add a helper function that splits an uninitialized buffer into start and size of the header as well as start and size of the payload, depending on the requested keyversion. Do the header-related calculations and the raw genkey request handling in separate functions. Use the raw genkey request function for internal purposes. 
Fixes: fa6999e326fe ("s390/pkey: support CCA and EP11 secure ECC private keys") Signed-off-by: Holger Dengler Reviewed-by: Ingo Franzki Signed-off-by: Heiko Carstens --- drivers/s390/crypto/pkey_api.c | 18 ++++-- drivers/s390/crypto/zcrypt_ep11misc.c | 103 +++++++++++++++++++++++++++++----- drivers/s390/crypto/zcrypt_ep11misc.h | 2 +- 3 files changed, 102 insertions(+), 21 deletions(-) diff --git a/drivers/s390/crypto/pkey_api.c b/drivers/s390/crypto/pkey_api.c index ba8581e0809c..2661d6a9ea13 100644 --- a/drivers/s390/crypto/pkey_api.c +++ b/drivers/s390/crypto/pkey_api.c @@ -713,6 +713,11 @@ static int pkey_genseckey2(const struct pkey_apqn *apqns, size_t nr_apqns, if (*keybufsize < MINEP11AESKEYBLOBSIZE) return -EINVAL; break; + case PKEY_TYPE_EP11_AES: + if (*keybufsize < (sizeof(struct ep11kblob_header) + + MINEP11AESKEYBLOBSIZE)) + return -EINVAL; + break; default: return -EINVAL; } @@ -729,9 +734,10 @@ static int pkey_genseckey2(const struct pkey_apqn *apqns, size_t nr_apqns, for (i = 0, rc = -ENODEV; i < nr_apqns; i++) { card = apqns[i].card; dom = apqns[i].domain; - if (ktype == PKEY_TYPE_EP11) { + if (ktype == PKEY_TYPE_EP11 || + ktype == PKEY_TYPE_EP11_AES) { rc = ep11_genaeskey(card, dom, ksize, kflags, - keybuf, keybufsize); + keybuf, keybufsize, ktype); } else if (ktype == PKEY_TYPE_CCA_DATA) { rc = cca_genseckey(card, dom, ksize, keybuf); *keybufsize = (rc ? 
0 : SECKEYBLOBSIZE); @@ -1466,7 +1472,7 @@ static long pkey_unlocked_ioctl(struct file *filp, unsigned int cmd, apqns = _copy_apqns_from_user(kgs.apqns, kgs.apqn_entries); if (IS_ERR(apqns)) return PTR_ERR(apqns); - kkey = kmalloc(klen, GFP_KERNEL); + kkey = kzalloc(klen, GFP_KERNEL); if (!kkey) { kfree(apqns); return -ENOMEM; @@ -2130,7 +2136,8 @@ static ssize_t pkey_ep11_aes_attr_read(enum pkey_key_size keybits, for (i = 0, rc = -ENODEV; i < nr_apqns; i++) { card = apqns[i] >> 16; dom = apqns[i] & 0xFFFF; - rc = ep11_genaeskey(card, dom, keybits, 0, buf, &keysize); + rc = ep11_genaeskey(card, dom, keybits, 0, buf, &keysize, + PKEY_TYPE_EP11); if (rc == 0) break; } @@ -2140,7 +2147,8 @@ static ssize_t pkey_ep11_aes_attr_read(enum pkey_key_size keybits, if (is_xts) { keysize = MAXEP11AESKEYBLOBSIZE; buf += MAXEP11AESKEYBLOBSIZE; - rc = ep11_genaeskey(card, dom, keybits, 0, buf, &keysize); + rc = ep11_genaeskey(card, dom, keybits, 0, buf, &keysize, + PKEY_TYPE_EP11); if (rc == 0) return 2 * MAXEP11AESKEYBLOBSIZE; } diff --git a/drivers/s390/crypto/zcrypt_ep11misc.c b/drivers/s390/crypto/zcrypt_ep11misc.c index d7ecd6ce5b7a..51f6753e01c5 100644 --- a/drivers/s390/crypto/zcrypt_ep11misc.c +++ b/drivers/s390/crypto/zcrypt_ep11misc.c @@ -113,6 +113,50 @@ static void __exit card_cache_free(void) spin_unlock_bh(&card_list_lock); } +static int ep11_kb_split(const u8 *kb, size_t kblen, u32 kbver, + struct ep11kblob_header **kbhdr, size_t *kbhdrsize, + u8 **kbpl, size_t *kbplsize) +{ + struct ep11kblob_header *hdr = NULL; + size_t hdrsize, plsize = 0; + int rc = -EINVAL; + u8 *pl = NULL; + + if (kblen < sizeof(struct ep11kblob_header)) + goto out; + hdr = (struct ep11kblob_header *)kb; + + switch (kbver) { + case TOKVER_EP11_AES: + /* header overlays the payload */ + hdrsize = 0; + break; + case TOKVER_EP11_ECC_WITH_HEADER: + case TOKVER_EP11_AES_WITH_HEADER: + /* payload starts after the header */ + hdrsize = sizeof(struct ep11kblob_header); + break; + default: + goto out; 
+ } + + plsize = kblen - hdrsize; + pl = (u8 *)kb + hdrsize; + + if (kbhdr) + *kbhdr = hdr; + if (kbhdrsize) + *kbhdrsize = hdrsize; + if (kbpl) + *kbpl = pl; + if (kbplsize) + *kbplsize = plsize; + + rc = 0; +out: + return rc; +} + /* * Simple check if the key blob is a valid EP11 AES key blob with header. */ @@ -664,8 +708,9 @@ EXPORT_SYMBOL(ep11_get_domain_info); */ #define KEY_ATTR_DEFAULTS 0x00200c00 -int ep11_genaeskey(u16 card, u16 domain, u32 keybitsize, u32 keygenflags, - u8 *keybuf, size_t *keybufsize) +static int _ep11_genaeskey(u16 card, u16 domain, + u32 keybitsize, u32 keygenflags, + u8 *keybuf, size_t *keybufsize) { struct keygen_req_pl { struct pl_head head; @@ -701,7 +746,6 @@ int ep11_genaeskey(u16 card, u16 domain, u32 keybitsize, u32 keygenflags, struct ep11_cprb *req = NULL, *rep = NULL; struct ep11_target_dev target; struct ep11_urb *urb = NULL; - struct ep11keyblob *kb; int api, rc = -ENOMEM; switch (keybitsize) { @@ -780,14 +824,9 @@ int ep11_genaeskey(u16 card, u16 domain, u32 keybitsize, u32 keygenflags, goto out; } - /* copy key blob and set header values */ + /* copy key blob */ memcpy(keybuf, rep_pl->data, rep_pl->data_len); *keybufsize = rep_pl->data_len; - kb = (struct ep11keyblob *)keybuf; - kb->head.type = TOKTYPE_NON_CCA; - kb->head.len = rep_pl->data_len; - kb->head.version = TOKVER_EP11_AES; - kb->head.bitlen = keybitsize; out: kfree(req); @@ -795,6 +834,43 @@ out: kfree(urb); return rc; } + +int ep11_genaeskey(u16 card, u16 domain, u32 keybitsize, u32 keygenflags, + u8 *keybuf, size_t *keybufsize, u32 keybufver) +{ + struct ep11kblob_header *hdr; + size_t hdr_size, pl_size; + u8 *pl; + int rc; + + switch (keybufver) { + case TOKVER_EP11_AES: + case TOKVER_EP11_AES_WITH_HEADER: + break; + default: + return -EINVAL; + } + + rc = ep11_kb_split(keybuf, *keybufsize, keybufver, + &hdr, &hdr_size, &pl, &pl_size); + if (rc) + return rc; + + rc = _ep11_genaeskey(card, domain, keybitsize, keygenflags, + pl, &pl_size); + if (rc) + return 
rc; + + *keybufsize = hdr_size + pl_size; + + /* update header information */ + hdr->type = TOKTYPE_NON_CCA; + hdr->len = *keybufsize; + hdr->version = keybufver; + hdr->bitlen = keybitsize; + + return 0; +} EXPORT_SYMBOL(ep11_genaeskey); static int ep11_cryptsingle(u16 card, u16 domain, @@ -1201,7 +1277,6 @@ int ep11_clr2keyblob(u16 card, u16 domain, u32 keybitsize, u32 keygenflags, const u8 *clrkey, u8 *keybuf, size_t *keybufsize) { int rc; - struct ep11keyblob *kb; u8 encbuf[64], *kek = NULL; size_t clrkeylen, keklen, encbuflen = sizeof(encbuf); @@ -1223,17 +1298,15 @@ int ep11_clr2keyblob(u16 card, u16 domain, u32 keybitsize, u32 keygenflags, } /* Step 1: generate AES 256 bit random kek key */ - rc = ep11_genaeskey(card, domain, 256, - 0x00006c00, /* EN/DECRYPT, WRAP/UNWRAP */ - kek, &keklen); + rc = _ep11_genaeskey(card, domain, 256, + 0x00006c00, /* EN/DECRYPT, WRAP/UNWRAP */ + kek, &keklen); if (rc) { DEBUG_ERR( "%s generate kek key failed, rc=%d\n", __func__, rc); goto out; } - kb = (struct ep11keyblob *)kek; - memset(&kb->head, 0, sizeof(kb->head)); /* Step 2: encrypt clear key value with the kek key */ rc = ep11_cryptsingle(card, domain, 0, 0, def_iv, kek, keklen, diff --git a/drivers/s390/crypto/zcrypt_ep11misc.h b/drivers/s390/crypto/zcrypt_ep11misc.h index 67cc80d71ba3..2eecbd7be6e5 100644 --- a/drivers/s390/crypto/zcrypt_ep11misc.h +++ b/drivers/s390/crypto/zcrypt_ep11misc.h @@ -107,7 +107,7 @@ int ep11_get_domain_info(u16 card, u16 domain, struct ep11_domain_info *info); * Generate (random) EP11 AES secure key. */ int ep11_genaeskey(u16 card, u16 domain, u32 keybitsize, u32 keygenflags, - u8 *keybuf, size_t *keybufsize); + u8 *keybuf, size_t *keybufsize, u32 keybufver); /* * Generate EP11 AES secure key with given clear key value. 
-- cgit v1.2.3 From da2863f15945de100b95c72d5656541d30956c5d Mon Sep 17 00:00:00 2001 From: Holger Dengler Date: Tue, 25 Jul 2023 11:24:47 +0200 Subject: s390/pkey: fix PKEY_TYPE_EP11_AES handling in PKEY_CLR2SECK2 IOCTL Commit 'fa6999e326fe ("s390/pkey: support CCA and EP11 secure ECC private keys")' introduced PKEY_TYPE_EP11_AES for the PKEY_CLR2SECK2 IOCTL to convert an AES clearkey into a securekey of this type. Unfortunately, all PKEY_CLR2SECK2 IOCTL requests with type PKEY_TYPE_EP11_AES return with an error (-EINVAL). Fix the handling for PKEY_TYPE_EP11_AES in PKEY_CLR2SECK2 IOCTL, so that userspace can convert clearkey blobs into PKEY_TYPE_EP11_AES securekey blobs. Cc: stable@vger.kernel.org # v5.10+ Fixes: fa6999e326fe ("s390/pkey: support CCA and EP11 secure ECC private keys") Signed-off-by: Holger Dengler Reviewed-by: Ingo Franzki Signed-off-by: Heiko Carstens --- drivers/s390/crypto/pkey_api.c | 16 ++++++--- drivers/s390/crypto/zcrypt_ep11misc.c | 61 ++++++++++++++++++++++++++--------- drivers/s390/crypto/zcrypt_ep11misc.h | 3 +- 3 files changed, 60 insertions(+), 20 deletions(-) diff --git a/drivers/s390/crypto/pkey_api.c b/drivers/s390/crypto/pkey_api.c index 2661d6a9ea13..7543757c82e2 100644 --- a/drivers/s390/crypto/pkey_api.c +++ b/drivers/s390/crypto/pkey_api.c @@ -272,7 +272,8 @@ static int pkey_clr2ep11key(const u8 *clrkey, size_t clrkeylen, card = apqns[i] >> 16; dom = apqns[i] & 0xFFFF; rc = ep11_clr2keyblob(card, dom, clrkeylen * 8, - 0, clrkey, keybuf, keybuflen); + 0, clrkey, keybuf, keybuflen, + PKEY_TYPE_EP11); if (rc == 0) break; } @@ -775,6 +776,11 @@ static int pkey_clr2seckey2(const struct pkey_apqn *apqns, size_t nr_apqns, if (*keybufsize < MINEP11AESKEYBLOBSIZE) return -EINVAL; break; + case PKEY_TYPE_EP11_AES: + if (*keybufsize < (sizeof(struct ep11kblob_header) + + MINEP11AESKEYBLOBSIZE)) + return -EINVAL; + break; default: return -EINVAL; } @@ -793,9 +799,11 @@ static int pkey_clr2seckey2(const struct pkey_apqn *apqns, size_t 
nr_apqns, for (i = 0, rc = -ENODEV; i < nr_apqns; i++) { card = apqns[i].card; dom = apqns[i].domain; - if (ktype == PKEY_TYPE_EP11) { + if (ktype == PKEY_TYPE_EP11 || + ktype == PKEY_TYPE_EP11_AES) { rc = ep11_clr2keyblob(card, dom, ksize, kflags, - clrkey, keybuf, keybufsize); + clrkey, keybuf, keybufsize, + ktype); } else if (ktype == PKEY_TYPE_CCA_DATA) { rc = cca_clr2seckey(card, dom, ksize, clrkey, keybuf); @@ -1514,7 +1522,7 @@ static long pkey_unlocked_ioctl(struct file *filp, unsigned int cmd, apqns = _copy_apqns_from_user(kcs.apqns, kcs.apqn_entries); if (IS_ERR(apqns)) return PTR_ERR(apqns); - kkey = kmalloc(klen, GFP_KERNEL); + kkey = kzalloc(klen, GFP_KERNEL); if (!kkey) { kfree(apqns); return -ENOMEM; diff --git a/drivers/s390/crypto/zcrypt_ep11misc.c b/drivers/s390/crypto/zcrypt_ep11misc.c index 51f6753e01c5..355d30bc0aac 100644 --- a/drivers/s390/crypto/zcrypt_ep11misc.c +++ b/drivers/s390/crypto/zcrypt_ep11misc.c @@ -1000,12 +1000,12 @@ out: return rc; } -static int ep11_unwrapkey(u16 card, u16 domain, - const u8 *kek, size_t keksize, - const u8 *enckey, size_t enckeysize, - u32 mech, const u8 *iv, - u32 keybitsize, u32 keygenflags, - u8 *keybuf, size_t *keybufsize) +static int _ep11_unwrapkey(u16 card, u16 domain, + const u8 *kek, size_t keksize, + const u8 *enckey, size_t enckeysize, + u32 mech, const u8 *iv, + u32 keybitsize, u32 keygenflags, + u8 *keybuf, size_t *keybufsize) { struct uw_req_pl { struct pl_head head; @@ -1042,7 +1042,6 @@ static int ep11_unwrapkey(u16 card, u16 domain, struct ep11_cprb *req = NULL, *rep = NULL; struct ep11_target_dev target; struct ep11_urb *urb = NULL; - struct ep11keyblob *kb; size_t req_pl_size; int api, rc = -ENOMEM; u8 *p; @@ -1124,14 +1123,9 @@ static int ep11_unwrapkey(u16 card, u16 domain, goto out; } - /* copy key blob and set header values */ + /* copy key blob */ memcpy(keybuf, rep_pl->data, rep_pl->data_len); *keybufsize = rep_pl->data_len; - kb = (struct ep11keyblob *)keybuf; - kb->head.type = 
TOKTYPE_NON_CCA; - kb->head.len = rep_pl->data_len; - kb->head.version = TOKVER_EP11_AES; - kb->head.bitlen = keybitsize; out: kfree(req); @@ -1140,6 +1134,42 @@ out: return rc; } +static int ep11_unwrapkey(u16 card, u16 domain, + const u8 *kek, size_t keksize, + const u8 *enckey, size_t enckeysize, + u32 mech, const u8 *iv, + u32 keybitsize, u32 keygenflags, + u8 *keybuf, size_t *keybufsize, + u8 keybufver) +{ + struct ep11kblob_header *hdr; + size_t hdr_size, pl_size; + u8 *pl; + int rc; + + rc = ep11_kb_split(keybuf, *keybufsize, keybufver, + &hdr, &hdr_size, &pl, &pl_size); + if (rc) + return rc; + + rc = _ep11_unwrapkey(card, domain, kek, keksize, enckey, enckeysize, + mech, iv, keybitsize, keygenflags, + pl, &pl_size); + if (rc) + return rc; + + *keybufsize = hdr_size + pl_size; + + /* update header information */ + hdr = (struct ep11kblob_header *)keybuf; + hdr->type = TOKTYPE_NON_CCA; + hdr->len = *keybufsize; + hdr->version = keybufver; + hdr->bitlen = keybitsize; + + return 0; +} + static int ep11_wrapkey(u16 card, u16 domain, const u8 *key, size_t keysize, u32 mech, const u8 *iv, @@ -1274,7 +1304,8 @@ out: } int ep11_clr2keyblob(u16 card, u16 domain, u32 keybitsize, u32 keygenflags, - const u8 *clrkey, u8 *keybuf, size_t *keybufsize) + const u8 *clrkey, u8 *keybuf, size_t *keybufsize, + u32 keytype) { int rc; u8 encbuf[64], *kek = NULL; @@ -1321,7 +1352,7 @@ int ep11_clr2keyblob(u16 card, u16 domain, u32 keybitsize, u32 keygenflags, /* Step 3: import the encrypted key value as a new key */ rc = ep11_unwrapkey(card, domain, kek, keklen, encbuf, encbuflen, 0, def_iv, - keybitsize, 0, keybuf, keybufsize); + keybitsize, 0, keybuf, keybufsize, keytype); if (rc) { DEBUG_ERR( "%s importing key value as new key failed,, rc=%d\n", diff --git a/drivers/s390/crypto/zcrypt_ep11misc.h b/drivers/s390/crypto/zcrypt_ep11misc.h index 2eecbd7be6e5..b611cf64231d 100644 --- a/drivers/s390/crypto/zcrypt_ep11misc.h +++ b/drivers/s390/crypto/zcrypt_ep11misc.h @@ -113,7 +113,8 
@@ int ep11_genaeskey(u16 card, u16 domain, u32 keybitsize, u32 keygenflags, * Generate EP11 AES secure key with given clear key value. */ int ep11_clr2keyblob(u16 cardnr, u16 domain, u32 keybitsize, u32 keygenflags, - const u8 *clrkey, u8 *keybuf, size_t *keybufsize); + const u8 *clrkey, u8 *keybuf, size_t *keybufsize, + u32 keytype); /* * Build a list of ep11 apqns meeting the following constrains: -- cgit v1.2.3 From d1fdfb0b2f339cf882c0b5431084a1950b8b73b9 Mon Sep 17 00:00:00 2001 From: Holger Dengler Date: Wed, 26 Jul 2023 16:22:19 +0200 Subject: s390/pkey: fix PKEY_TYPE_EP11_AES handling in PKEY_KBLOB2PROTK[23] Commit 'fa6999e326fe ("s390/pkey: support CCA and EP11 secure ECC private keys")' introduced a new PKEY_TYPE_EP11_AES type for the PKEY_KBLOB2PROTK2 and a new IOCTL, PKEY_KBLOB2PROTK3, both of which allow userspace to convert opaque securekey blobs of this type into protectedkey blobs. Unfortunately, all PKEY_KBLOB2PROTK2 and PKEY_KBLOB2PROTK3 IOCTL requests with keyblobs of this type return with an error (-EINVAL). Fix PKEY_TYPE_EP11_AES handling in PKEY_KBLOB2PROTK2 and PKEY_KBLOB2PROTK3 IOCTLs, so that userspace can convert PKEY_TYPE_EP11_AES keyblobs into protectedkey blobs. Add a helper function to decode the start and size of the internal header as well as start and size of the keyblob payload of an existing keyblob. Also validate the length of header and keyblob, as well as the keyblob magic. Introduce another helper function, which handles a raw key wrapping request and do the keyblob decoding in the calling function. Remove all other header-related calculations. 
Fixes: fa6999e326fe ("s390/pkey: support CCA and EP11 secure ECC private keys") Signed-off-by: Holger Dengler Reviewed-by: Ingo Franzki Signed-off-by: Heiko Carstens --- drivers/s390/crypto/pkey_api.c | 33 ++++----- drivers/s390/crypto/zcrypt_ep11misc.c | 123 +++++++++++++++++++++------------- drivers/s390/crypto/zcrypt_ep11misc.h | 6 ++ 3 files changed, 100 insertions(+), 62 deletions(-) diff --git a/drivers/s390/crypto/pkey_api.c b/drivers/s390/crypto/pkey_api.c index 7543757c82e2..75d7f0d5f14e 100644 --- a/drivers/s390/crypto/pkey_api.c +++ b/drivers/s390/crypto/pkey_api.c @@ -288,10 +288,9 @@ out: /* * Find card and transform EP11 secure key into protected key. */ -static int pkey_ep11key2pkey(const u8 *key, u8 *protkey, - u32 *protkeylen, u32 *protkeytype) +static int pkey_ep11key2pkey(const u8 *key, size_t keylen, + u8 *protkey, u32 *protkeylen, u32 *protkeytype) { - struct ep11keyblob *kb = (struct ep11keyblob *)key; u32 nr_apqns, *apqns = NULL; u16 card, dom; int i, rc; @@ -300,7 +299,8 @@ static int pkey_ep11key2pkey(const u8 *key, u8 *protkey, /* build a list of apqns suitable for this key */ rc = ep11_findcard2(&apqns, &nr_apqns, 0xFFFF, 0xFFFF, - ZCRYPT_CEX7, EP11_API_V, kb->wkvp); + ZCRYPT_CEX7, EP11_API_V, + ep11_kb_wkvp(key, keylen)); if (rc) goto out; @@ -308,7 +308,7 @@ static int pkey_ep11key2pkey(const u8 *key, u8 *protkey, for (rc = -ENODEV, i = 0; i < nr_apqns; i++) { card = apqns[i] >> 16; dom = apqns[i] & 0xFFFF; - rc = ep11_kblob2protkey(card, dom, key, kb->head.len, + rc = ep11_kblob2protkey(card, dom, key, keylen, protkey, protkeylen, protkeytype); if (rc == 0) break; @@ -496,7 +496,7 @@ try_via_ep11: tmpbuf, &tmpbuflen); if (rc) goto failure; - rc = pkey_ep11key2pkey(tmpbuf, + rc = pkey_ep11key2pkey(tmpbuf, tmpbuflen, protkey, protkeylen, protkeytype); if (!rc) goto out; @@ -612,7 +612,7 @@ static int pkey_nonccatok2pkey(const u8 *key, u32 keylen, rc = ep11_check_aes_key(debug_info, 3, key, keylen, 1); if (rc) goto out; - rc = 
pkey_ep11key2pkey(key, + rc = pkey_ep11key2pkey(key, keylen, protkey, protkeylen, protkeytype); break; } @@ -621,7 +621,7 @@ static int pkey_nonccatok2pkey(const u8 *key, u32 keylen, rc = ep11_check_aes_key_with_hdr(debug_info, 3, key, keylen, 1); if (rc) goto out; - rc = pkey_ep11key2pkey(key + sizeof(struct ep11kblob_header), + rc = pkey_ep11key2pkey(key, keylen, protkey, protkeylen, protkeytype); break; default: @@ -963,10 +963,12 @@ static int pkey_keyblob2pkey2(const struct pkey_apqn *apqns, size_t nr_apqns, } } else if (hdr->type == TOKTYPE_NON_CCA) { if (hdr->version == TOKVER_EP11_AES) { - if (keylen < sizeof(struct ep11keyblob)) - return -EINVAL; if (ep11_check_aes_key(debug_info, 3, key, keylen, 1)) return -EINVAL; + } else if (hdr->version == TOKVER_EP11_AES_WITH_HEADER) { + if (ep11_check_aes_key_with_hdr(debug_info, 3, + key, keylen, 1)) + return -EINVAL; } else { return pkey_nonccatok2pkey(key, keylen, protkey, protkeylen, @@ -994,10 +996,7 @@ static int pkey_keyblob2pkey2(const struct pkey_apqn *apqns, size_t nr_apqns, protkey, protkeylen, protkeytype); } else { - /* EP11 AES secure key blob */ - struct ep11keyblob *kb = (struct ep11keyblob *)key; - - rc = ep11_kblob2protkey(card, dom, key, kb->head.len, + rc = ep11_kblob2protkey(card, dom, key, keylen, protkey, protkeylen, protkeytype); } @@ -1257,12 +1256,14 @@ static int pkey_keyblob2pkey3(const struct pkey_apqn *apqns, size_t nr_apqns, hdr->version == TOKVER_EP11_ECC_WITH_HEADER) && is_ep11_keyblob(key + sizeof(struct ep11kblob_header))) rc = ep11_kblob2protkey(card, dom, key, hdr->len, - protkey, protkeylen, protkeytype); + protkey, protkeylen, + protkeytype); else if (hdr->type == TOKTYPE_NON_CCA && hdr->version == TOKVER_EP11_AES && is_ep11_keyblob(key)) rc = ep11_kblob2protkey(card, dom, key, hdr->len, - protkey, protkeylen, protkeytype); + protkey, protkeylen, + protkeytype); else if (hdr->type == TOKTYPE_CCA_INTERNAL && hdr->version == TOKVER_CCA_AES) rc = cca_sec2protkey(card, dom, key, 
protkey, diff --git a/drivers/s390/crypto/zcrypt_ep11misc.c b/drivers/s390/crypto/zcrypt_ep11misc.c index 355d30bc0aac..669ad6f5d5b0 100644 --- a/drivers/s390/crypto/zcrypt_ep11misc.c +++ b/drivers/s390/crypto/zcrypt_ep11misc.c @@ -157,6 +157,65 @@ out: return rc; } +static int ep11_kb_decode(const u8 *kb, size_t kblen, + struct ep11kblob_header **kbhdr, size_t *kbhdrsize, + struct ep11keyblob **kbpl, size_t *kbplsize) +{ + struct ep11kblob_header *tmph, *hdr = NULL; + size_t hdrsize = 0, plsize = 0; + struct ep11keyblob *pl = NULL; + int rc = -EINVAL; + u8 *tmpp; + + if (kblen < sizeof(struct ep11kblob_header)) + goto out; + tmph = (struct ep11kblob_header *)kb; + + if (tmph->type != TOKTYPE_NON_CCA && + tmph->len > kblen) + goto out; + + if (ep11_kb_split(kb, kblen, tmph->version, + &hdr, &hdrsize, &tmpp, &plsize)) + goto out; + + if (plsize < sizeof(struct ep11keyblob)) + goto out; + + if (!is_ep11_keyblob(tmpp)) + goto out; + + pl = (struct ep11keyblob *)tmpp; + plsize = hdr->len - hdrsize; + + if (kbhdr) + *kbhdr = hdr; + if (kbhdrsize) + *kbhdrsize = hdrsize; + if (kbpl) + *kbpl = pl; + if (kbplsize) + *kbplsize = plsize; + + rc = 0; +out: + return rc; +} + +/* + * For valid ep11 keyblobs, returns a reference to the wrappingkey verification + * pattern. Otherwise NULL. + */ +const u8 *ep11_kb_wkvp(const u8 *keyblob, size_t keybloblen) +{ + struct ep11keyblob *kb; + + if (ep11_kb_decode(keyblob, keybloblen, NULL, NULL, &kb, NULL)) + return NULL; + return kb->wkvp; +} +EXPORT_SYMBOL(ep11_kb_wkvp); + /* * Simple check if the key blob is a valid EP11 AES key blob with header. 
*/ @@ -1170,10 +1229,10 @@ static int ep11_unwrapkey(u16 card, u16 domain, return 0; } -static int ep11_wrapkey(u16 card, u16 domain, - const u8 *key, size_t keysize, - u32 mech, const u8 *iv, - u8 *databuf, size_t *datasize) +static int _ep11_wrapkey(u16 card, u16 domain, + const u8 *key, size_t keysize, + u32 mech, const u8 *iv, + u8 *databuf, size_t *datasize) { struct wk_req_pl { struct pl_head head; @@ -1203,20 +1262,10 @@ static int ep11_wrapkey(u16 card, u16 domain, struct ep11_cprb *req = NULL, *rep = NULL; struct ep11_target_dev target; struct ep11_urb *urb = NULL; - struct ep11keyblob *kb; size_t req_pl_size; int api, rc = -ENOMEM; - bool has_header = false; u8 *p; - /* maybe the session field holds a header with key info */ - kb = (struct ep11keyblob *)key; - if (kb->head.type == TOKTYPE_NON_CCA && - kb->head.version == TOKVER_EP11_AES) { - has_header = true; - keysize = min_t(size_t, kb->head.len, keysize); - } - /* request cprb and payload */ req_pl_size = sizeof(struct wk_req_pl) + (iv ? 
16 : 0) + ASN1TAGLEN(keysize) + 4; @@ -1241,11 +1290,6 @@ static int ep11_wrapkey(u16 card, u16 domain, } /* key blob */ p += asn1tag_write(p, 0x04, key, keysize); - /* maybe the key argument needs the head data cleaned out */ - if (has_header) { - kb = (struct ep11keyblob *)(p - keysize); - memset(&kb->head, 0, sizeof(kb->head)); - } /* empty kek tag */ *p++ = 0x04; *p++ = 0; @@ -1366,11 +1410,12 @@ out: } EXPORT_SYMBOL(ep11_clr2keyblob); -int ep11_kblob2protkey(u16 card, u16 dom, const u8 *keyblob, size_t keybloblen, +int ep11_kblob2protkey(u16 card, u16 dom, + const u8 *keyblob, size_t keybloblen, u8 *protkey, u32 *protkeylen, u32 *protkeytype) { - int rc = -EIO; - u8 *wkbuf = NULL; + struct ep11kblob_header *hdr; + struct ep11keyblob *key; size_t wkbuflen, keylen; struct wk_info { u16 version; @@ -1381,31 +1426,17 @@ int ep11_kblob2protkey(u16 card, u16 dom, const u8 *keyblob, size_t keybloblen, u8 res2[8]; u8 pkey[]; } __packed * wki; - const u8 *key; - struct ep11kblob_header *hdr; + u8 *wkbuf = NULL; + int rc = -EIO; - /* key with or without header ? */ - hdr = (struct ep11kblob_header *)keyblob; - if (hdr->type == TOKTYPE_NON_CCA && - (hdr->version == TOKVER_EP11_AES_WITH_HEADER || - hdr->version == TOKVER_EP11_ECC_WITH_HEADER) && - is_ep11_keyblob(keyblob + sizeof(struct ep11kblob_header))) { - /* EP11 AES or ECC key with header */ - key = keyblob + sizeof(struct ep11kblob_header); - keylen = hdr->len - sizeof(struct ep11kblob_header); - } else if (hdr->type == TOKTYPE_NON_CCA && - hdr->version == TOKVER_EP11_AES && - is_ep11_keyblob(keyblob)) { - /* EP11 AES key (old style) */ - key = keyblob; - keylen = hdr->len; - } else if (is_ep11_keyblob(keyblob)) { - /* raw EP11 key blob */ - key = keyblob; - keylen = keybloblen; - } else { + if (ep11_kb_decode((u8 *)keyblob, keybloblen, &hdr, NULL, &key, &keylen)) return -EINVAL; + + if (hdr->version == TOKVER_EP11_AES) { + /* wipe overlayed header */ + memset(hdr, 0, sizeof(*hdr)); } + /* !!! 
hdr is no longer a valid header !!! */ /* alloc temp working buffer */ wkbuflen = (keylen + AES_BLOCK_SIZE) & (~(AES_BLOCK_SIZE - 1)); @@ -1414,8 +1445,8 @@ int ep11_kblob2protkey(u16 card, u16 dom, const u8 *keyblob, size_t keybloblen, return -ENOMEM; /* ep11 secure key -> protected key + info */ - rc = ep11_wrapkey(card, dom, key, keylen, - 0, def_iv, wkbuf, &wkbuflen); + rc = _ep11_wrapkey(card, dom, (u8 *)key, keylen, + 0, def_iv, wkbuf, &wkbuflen); if (rc) { DEBUG_ERR( "%s rewrapping ep11 key to pkey failed, rc=%d\n", diff --git a/drivers/s390/crypto/zcrypt_ep11misc.h b/drivers/s390/crypto/zcrypt_ep11misc.h index b611cf64231d..a0de1cccebbe 100644 --- a/drivers/s390/crypto/zcrypt_ep11misc.h +++ b/drivers/s390/crypto/zcrypt_ep11misc.h @@ -48,6 +48,12 @@ static inline bool is_ep11_keyblob(const u8 *key) return (kb->version == EP11_STRUCT_MAGIC); } +/* + * For valid ep11 keyblobs, returns a reference to the wrappingkey verification + * pattern. Otherwise NULL. + */ +const u8 *ep11_kb_wkvp(const u8 *kblob, size_t kbloblen); + /* * Simple check if the key blob is a valid EP11 AES key blob with header. * If checkcpacfexport is enabled, the key is also checked for the -- cgit v1.2.3 From 745742dbca11a1b63684ec7032a81aaedcf51fb0 Mon Sep 17 00:00:00 2001 From: Holger Dengler Date: Tue, 25 Jul 2023 13:05:36 +0200 Subject: s390/pkey: fix PKEY_TYPE_EP11_AES handling in PKEY_VERIFYKEY2 IOCTL Commit 'fa6999e326fe ("s390/pkey: support CCA and EP11 secure ECC private keys")' introduced a new PKEY_TYPE_EP11_AES type for the PKEY_VERIFYKEY2 IOCTL to verify keyblobs of this type. Unfortunately, all PKEY_VERIFYKEY2 IOCTL requests with keyblobs of this type return with an error (-EINVAL). Fix PKEY_TYPE_EP11_AES handling in PKEY_VERIFYKEY2 IOCTL, so that userspace can verify keyblobs of this type. 
Fixes: fa6999e326fe ("s390/pkey: support CCA and EP11 secure ECC private keys") Signed-off-by: Holger Dengler Reviewed-by: Ingo Franzki Signed-off-by: Heiko Carstens --- drivers/s390/crypto/pkey_api.c | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/drivers/s390/crypto/pkey_api.c b/drivers/s390/crypto/pkey_api.c index 75d7f0d5f14e..8d6f35ccc561 100644 --- a/drivers/s390/crypto/pkey_api.c +++ b/drivers/s390/crypto/pkey_api.c @@ -912,7 +912,8 @@ static int pkey_verifykey2(const u8 *key, size_t keylen, *ksize = kb->head.bitlen; rc = ep11_findcard2(&_apqns, &_nr_apqns, *cardnr, *domain, - ZCRYPT_CEX7, EP11_API_V, kb->wkvp); + ZCRYPT_CEX7, EP11_API_V, + ep11_kb_wkvp(key, keylen)); if (rc) goto out; @@ -922,6 +923,30 @@ static int pkey_verifykey2(const u8 *key, size_t keylen, *cardnr = ((struct pkey_apqn *)_apqns)->card; *domain = ((struct pkey_apqn *)_apqns)->domain; + } else if (hdr->type == TOKTYPE_NON_CCA && + hdr->version == TOKVER_EP11_AES_WITH_HEADER) { + struct ep11kblob_header *kh = (struct ep11kblob_header *)key; + + rc = ep11_check_aes_key_with_hdr(debug_info, 3, + key, keylen, 1); + if (rc) + goto out; + if (ktype) + *ktype = PKEY_TYPE_EP11_AES; + if (ksize) + *ksize = kh->bitlen; + + rc = ep11_findcard2(&_apqns, &_nr_apqns, *cardnr, *domain, + ZCRYPT_CEX7, EP11_API_V, + ep11_kb_wkvp(key, keylen)); + if (rc) + goto out; + + if (flags) + *flags = PKEY_FLAGS_MATCH_CUR_MKVP; + + *cardnr = ((struct pkey_apqn *)_apqns)->card; + *domain = ((struct pkey_apqn *)_apqns)->domain; } else { rc = -EINVAL; } -- cgit v1.2.3 From b9352e4b9b9eff949bcc6907b8569b3a1d992f1e Mon Sep 17 00:00:00 2001 From: Holger Dengler Date: Fri, 4 Aug 2023 16:02:58 +0200 Subject: s390/pkey: fix PKEY_TYPE_EP11_AES handling for sysfs attributes Commit 'fa6999e326fe ("s390/pkey: support CCA and EP11 secure ECC private keys")' introduced a new PKEY_TYPE_EP11_AES securekey type as a supplement to the existing PKEY_TYPE_EP11 (which won't work in 
environments with session-bound keys). The pkey EP11 securekey attributes use PKEY_TYPE_EP11_AES (instead of PKEY_TYPE_EP11) keyblobs, to make the generated keyblobs usable also in environments, where session-bound keys are required. There should be no negative impacts to userspace because the internal structure of the keyblobs is opaque. The increased size of the generated keyblobs is reflected by the changed size of the attributes. Fixes: fa6999e326fe ("s390/pkey: support CCA and EP11 secure ECC private keys") Signed-off-by: Holger Dengler Reviewed-by: Ingo Franzki Signed-off-by: Heiko Carstens --- arch/s390/include/uapi/asm/pkey.h | 2 +- drivers/s390/crypto/pkey_api.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/s390/include/uapi/asm/pkey.h b/arch/s390/include/uapi/asm/pkey.h index 5faf0a1d2c16..5ad76471e73f 100644 --- a/arch/s390/include/uapi/asm/pkey.h +++ b/arch/s390/include/uapi/asm/pkey.h @@ -26,7 +26,7 @@ #define MAXCLRKEYSIZE 32 /* a clear key value may be up to 32 bytes */ #define MAXAESCIPHERKEYSIZE 136 /* our aes cipher keys have always 136 bytes */ #define MINEP11AESKEYBLOBSIZE 256 /* min EP11 AES key blob size */ -#define MAXEP11AESKEYBLOBSIZE 320 /* max EP11 AES key blob size */ +#define MAXEP11AESKEYBLOBSIZE 336 /* max EP11 AES key blob size */ /* Minimum size of a key blob */ #define MINKEYBLOBSIZE SECKEYBLOBSIZE diff --git a/drivers/s390/crypto/pkey_api.c b/drivers/s390/crypto/pkey_api.c index 8d6f35ccc561..396a159afdf5 100644 --- a/drivers/s390/crypto/pkey_api.c +++ b/drivers/s390/crypto/pkey_api.c @@ -2142,7 +2142,7 @@ static struct attribute_group ccacipher_attr_group = { * (i.e. off != 0 or count < key blob size) -EINVAL is returned. * This function and the sysfs attributes using it provide EP11 key blobs * padded to the upper limit of MAXEP11AESKEYBLOBSIZE which is currently - * 320 bytes. + * 336 bytes. 
*/ static ssize_t pkey_ep11_aes_attr_read(enum pkey_key_size keybits, bool is_xts, char *buf, loff_t off, @@ -2171,7 +2171,7 @@ static ssize_t pkey_ep11_aes_attr_read(enum pkey_key_size keybits, card = apqns[i] >> 16; dom = apqns[i] & 0xFFFF; rc = ep11_genaeskey(card, dom, keybits, 0, buf, &keysize, - PKEY_TYPE_EP11); + PKEY_TYPE_EP11_AES); if (rc == 0) break; } @@ -2182,7 +2182,7 @@ static ssize_t pkey_ep11_aes_attr_read(enum pkey_key_size keybits, keysize = MAXEP11AESKEYBLOBSIZE; buf += MAXEP11AESKEYBLOBSIZE; rc = ep11_genaeskey(card, dom, keybits, 0, buf, &keysize, - PKEY_TYPE_EP11); + PKEY_TYPE_EP11_AES); if (rc == 0) return 2 * MAXEP11AESKEYBLOBSIZE; } -- cgit v1.2.3 From cba33db3fc4dbf2e54294b0e499d2335a3a00d78 Mon Sep 17 00:00:00 2001 From: Holger Dengler Date: Wed, 9 Aug 2023 14:23:45 +0200 Subject: s390/paes: fix PKEY_TYPE_EP11_AES handling for secure keyblobs Commit 'fa6999e326fe ("s390/pkey: support CCA and EP11 secure ECC private keys")' introduced PKEY_TYPE_EP11_AES securekey blobs as a supplement to the PKEY_TYPE_EP11 (which won't work in environments with session-bound keys). These new keyblobs have a different maximum size, so fix the paes crypto module to also accept these larger keyblobs. Fixes: fa6999e326fe ("s390/pkey: support CCA and EP11 secure ECC private keys") Signed-off-by: Holger Dengler Reviewed-by: Ingo Franzki Signed-off-by: Heiko Carstens --- arch/s390/crypto/paes_s390.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/crypto/paes_s390.c b/arch/s390/crypto/paes_s390.c index 38349150c96e..8b541e44151d 100644 --- a/arch/s390/crypto/paes_s390.c +++ b/arch/s390/crypto/paes_s390.c @@ -35,7 +35,7 @@ * and padding is also possible, the limits need to be generous. 
*/ #define PAES_MIN_KEYSIZE 16 -#define PAES_MAX_KEYSIZE 320 +#define PAES_MAX_KEYSIZE MAXEP11AESKEYBLOBSIZE static u8 *ctrblk; static DEFINE_MUTEX(ctrblk_lock); -- cgit v1.2.3 From 386cb81e4ba7811573765aaaeb91b472639c2bae Mon Sep 17 00:00:00 2001 From: Holger Dengler Date: Fri, 11 Aug 2023 16:56:20 +0200 Subject: s390/zcrypt_ep11misc: support API ordinal 6 with empty pin-blob Secure execution guest environments require an empty pinblob in all key generation and unwrap requests. Empty pinblobs are only available in EP11 API ordinal 6 or higher. Add an empty pinblob to key generation and unwrap requests, if the AP secure binding facility is available. In all other cases, stay with the empty pin tag (no pinblob) and the current API ordinals. The EP11 API ordinal also needs to be considered when the pkey module tries to figure out the list of eligible cards for key operations with protected keys in a secure execution environment. These changes are transparent to userspace but required for running a secure execution guest that handles key generation and key derivation (e.g. secure key to protected key) correctly. Especially using EP11 secure keys with the kernel dm-crypt layer requires this patch. Co-developed-by: Harald Freudenberger Signed-off-by: Harald Freudenberger Signed-off-by: Holger Dengler Reviewed-by: Ingo Franzki Signed-off-by: Heiko Carstens --- drivers/s390/crypto/ap_bus.c | 9 ++++++ drivers/s390/crypto/ap_bus.h | 1 + drivers/s390/crypto/pkey_api.c | 27 +++++++++++----- drivers/s390/crypto/zcrypt_ep11misc.c | 60 +++++++++++++++++++++++++---------- drivers/s390/crypto/zcrypt_ep11misc.h | 4 ++- 5 files changed, 76 insertions(+), 25 deletions(-) diff --git a/drivers/s390/crypto/ap_bus.c b/drivers/s390/crypto/ap_bus.c index b1d2fedea086..339812efe822 100644 --- a/drivers/s390/crypto/ap_bus.c +++ b/drivers/s390/crypto/ap_bus.c @@ -218,6 +218,15 @@ int ap_sb_available(void) return 0; } +/* + * ap_is_se_guest(): Check for SE guest with AP pass-through support. 
+ */ +bool ap_is_se_guest(void) +{ + return is_prot_virt_guest() && ap_sb_available(); +} +EXPORT_SYMBOL(ap_is_se_guest); + /* * ap_fetch_qci_info(): Fetch cryptographic config info * diff --git a/drivers/s390/crypto/ap_bus.h b/drivers/s390/crypto/ap_bus.h index 47bbe9babc59..be54b070c031 100644 --- a/drivers/s390/crypto/ap_bus.h +++ b/drivers/s390/crypto/ap_bus.h @@ -274,6 +274,7 @@ void ap_flush_queue(struct ap_queue *aq); void *ap_airq_ptr(void); int ap_sb_available(void); +bool ap_is_se_guest(void); void ap_wait(enum ap_sm_wait wait); void ap_request_timeout(struct timer_list *t); void ap_bus_force_rescan(void); diff --git a/drivers/s390/crypto/pkey_api.c b/drivers/s390/crypto/pkey_api.c index 396a159afdf5..6cfb6b2340c9 100644 --- a/drivers/s390/crypto/pkey_api.c +++ b/drivers/s390/crypto/pkey_api.c @@ -263,7 +263,9 @@ static int pkey_clr2ep11key(const u8 *clrkey, size_t clrkeylen, /* build a list of apqns suitable for ep11 keys with cpacf support */ rc = ep11_findcard2(&apqns, &nr_apqns, 0xFFFF, 0xFFFF, - ZCRYPT_CEX7, EP11_API_V, NULL); + ZCRYPT_CEX7, + ap_is_se_guest() ? EP11_API_V6 : EP11_API_V4, + NULL); if (rc) goto out; @@ -299,7 +301,8 @@ static int pkey_ep11key2pkey(const u8 *key, size_t keylen, /* build a list of apqns suitable for this key */ rc = ep11_findcard2(&apqns, &nr_apqns, 0xFFFF, 0xFFFF, - ZCRYPT_CEX7, EP11_API_V, + ZCRYPT_CEX7, + ap_is_se_guest() ? EP11_API_V6 : EP11_API_V4, ep11_kb_wkvp(key, keylen)); if (rc) goto out; @@ -902,6 +905,7 @@ static int pkey_verifykey2(const u8 *key, size_t keylen, } else if (hdr->type == TOKTYPE_NON_CCA && hdr->version == TOKVER_EP11_AES) { struct ep11keyblob *kb = (struct ep11keyblob *)key; + int api; rc = ep11_check_aes_key(debug_info, 3, key, keylen, 1); if (rc) @@ -911,8 +915,9 @@ static int pkey_verifykey2(const u8 *key, size_t keylen, if (ksize) *ksize = kb->head.bitlen; + api = ap_is_se_guest() ? 
EP11_API_V6 : EP11_API_V4; rc = ep11_findcard2(&_apqns, &_nr_apqns, *cardnr, *domain, - ZCRYPT_CEX7, EP11_API_V, + ZCRYPT_CEX7, api, ep11_kb_wkvp(key, keylen)); if (rc) goto out; @@ -926,6 +931,7 @@ static int pkey_verifykey2(const u8 *key, size_t keylen, } else if (hdr->type == TOKTYPE_NON_CCA && hdr->version == TOKVER_EP11_AES_WITH_HEADER) { struct ep11kblob_header *kh = (struct ep11kblob_header *)key; + int api; rc = ep11_check_aes_key_with_hdr(debug_info, 3, key, keylen, 1); @@ -936,8 +942,9 @@ static int pkey_verifykey2(const u8 *key, size_t keylen, if (ksize) *ksize = kh->bitlen; + api = ap_is_se_guest() ? EP11_API_V6 : EP11_API_V4; rc = ep11_findcard2(&_apqns, &_nr_apqns, *cardnr, *domain, - ZCRYPT_CEX7, EP11_API_V, + ZCRYPT_CEX7, api, ep11_kb_wkvp(key, keylen)); if (rc) goto out; @@ -1056,7 +1063,7 @@ static int pkey_apqns4key(const u8 *key, size_t keylen, u32 flags, return -EINVAL; if (kb->attr & EP11_BLOB_PKEY_EXTRACTABLE) { minhwtype = ZCRYPT_CEX7; - api = EP11_API_V; + api = ap_is_se_guest() ? EP11_API_V6 : EP11_API_V4; } rc = ep11_findcard2(&_apqns, &_nr_apqns, 0xFFFF, 0xFFFF, minhwtype, api, kb->wkvp); @@ -1072,7 +1079,7 @@ static int pkey_apqns4key(const u8 *key, size_t keylen, u32 flags, return -EINVAL; if (kb->attr & EP11_BLOB_PKEY_EXTRACTABLE) { minhwtype = ZCRYPT_CEX7; - api = EP11_API_V; + api = ap_is_se_guest() ? EP11_API_V6 : EP11_API_V4; } rc = ep11_findcard2(&_apqns, &_nr_apqns, 0xFFFF, 0xFFFF, minhwtype, api, kb->wkvp); @@ -1182,11 +1189,13 @@ static int pkey_apqns4keytype(enum pkey_key_type ktype, ktype == PKEY_TYPE_EP11_AES || ktype == PKEY_TYPE_EP11_ECC) { u8 *wkvp = NULL; + int api; if (flags & PKEY_FLAGS_MATCH_CUR_MKVP) wkvp = cur_mkvp; + api = ap_is_se_guest() ? 
EP11_API_V6 : EP11_API_V4; rc = ep11_findcard2(&_apqns, &_nr_apqns, 0xFFFF, 0xFFFF, - ZCRYPT_CEX7, EP11_API_V, wkvp); + ZCRYPT_CEX7, api, wkvp); if (rc) goto out; @@ -2160,7 +2169,9 @@ static ssize_t pkey_ep11_aes_attr_read(enum pkey_key_size keybits, /* build a list of apqns able to generate an cipher key */ rc = ep11_findcard2(&apqns, &nr_apqns, 0xFFFF, 0xFFFF, - ZCRYPT_CEX7, EP11_API_V, NULL); + ZCRYPT_CEX7, + ap_is_se_guest() ? EP11_API_V6 : EP11_API_V4, + NULL); if (rc) return rc; diff --git a/drivers/s390/crypto/zcrypt_ep11misc.c b/drivers/s390/crypto/zcrypt_ep11misc.c index 669ad6f5d5b0..0a877f9792c2 100644 --- a/drivers/s390/crypto/zcrypt_ep11misc.c +++ b/drivers/s390/crypto/zcrypt_ep11misc.c @@ -29,6 +29,8 @@ #define DEBUG_WARN(...) ZCRYPT_DBF(DBF_WARN, ##__VA_ARGS__) #define DEBUG_ERR(...) ZCRYPT_DBF(DBF_ERR, ##__VA_ARGS__) +#define EP11_PINBLOB_V1_BYTES 56 + /* default iv used here */ static const u8 def_iv[16] = { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff }; @@ -592,7 +594,7 @@ static int ep11_query_info(u16 cardnr, u16 domain, u32 query_type, struct ep11_cprb *req = NULL, *rep = NULL; struct ep11_target_dev target; struct ep11_urb *urb = NULL; - int api = 1, rc = -ENOMEM; + int api = EP11_API_V1, rc = -ENOMEM; /* request cprb and payload */ req = alloc_cprb(sizeof(struct ep11_info_req_pl)); @@ -789,8 +791,7 @@ static int _ep11_genaeskey(u16 card, u16 domain, u32 attr_bool_bits; u32 attr_val_len_type; u32 attr_val_len_value; - u8 pin_tag; - u8 pin_len; + /* followed by empty pin tag or empty pinblob tag */ } __packed * req_pl; struct keygen_rep_pl { struct pl_head head; @@ -803,9 +804,11 @@ static int _ep11_genaeskey(u16 card, u16 domain, u8 data[512]; } __packed * rep_pl; struct ep11_cprb *req = NULL, *rep = NULL; + size_t req_pl_size, pinblob_size = 0; struct ep11_target_dev target; struct ep11_urb *urb = NULL; int api, rc = -ENOMEM; + u8 *p; switch (keybitsize) { case 128: @@ -821,12 +824,22 @@ 
static int _ep11_genaeskey(u16 card, u16 domain, } /* request cprb and payload */ - req = alloc_cprb(sizeof(struct keygen_req_pl)); + api = (!keygenflags || keygenflags & 0x00200000) ? + EP11_API_V4 : EP11_API_V1; + if (ap_is_se_guest()) { + /* + * genkey within SE environment requires API ordinal 6 + * with empty pinblob + */ + api = EP11_API_V6; + pinblob_size = EP11_PINBLOB_V1_BYTES; + } + req_pl_size = sizeof(struct keygen_req_pl) + ASN1TAGLEN(pinblob_size); + req = alloc_cprb(req_pl_size); if (!req) goto out; req_pl = (struct keygen_req_pl *)(((u8 *)req) + sizeof(*req)); - api = (!keygenflags || keygenflags & 0x00200000) ? 4 : 1; - prep_head(&req_pl->head, sizeof(*req_pl), api, 21); /* GenerateKey */ + prep_head(&req_pl->head, req_pl_size, api, 21); /* GenerateKey */ req_pl->var_tag = 0x04; req_pl->var_len = sizeof(u32); req_pl->keybytes_tag = 0x04; @@ -842,7 +855,10 @@ static int _ep11_genaeskey(u16 card, u16 domain, req_pl->attr_bool_bits = keygenflags ? keygenflags : KEY_ATTR_DEFAULTS; req_pl->attr_val_len_type = 0x00000161; /* CKA_VALUE_LEN */ req_pl->attr_val_len_value = keybitsize / 8; - req_pl->pin_tag = 0x04; + p = ((u8 *)req_pl) + sizeof(*req_pl); + /* pin tag */ + *p++ = 0x04; + *p++ = pinblob_size; /* reply cprb and payload */ rep = alloc_cprb(sizeof(struct keygen_rep_pl)); @@ -857,7 +873,7 @@ static int _ep11_genaeskey(u16 card, u16 domain, target.ap_id = card; target.dom_id = domain; prep_urb(urb, &target, 1, - req, sizeof(*req) + sizeof(*req_pl), + req, sizeof(*req) + req_pl_size, rep, sizeof(*rep) + sizeof(*rep_pl)); rc = zcrypt_send_ep11_cprb(urb); @@ -965,7 +981,7 @@ static int ep11_cryptsingle(u16 card, u16 domain, struct ep11_target_dev target; struct ep11_urb *urb = NULL; size_t req_pl_size, rep_pl_size; - int n, api = 1, rc = -ENOMEM; + int n, api = EP11_API_V1, rc = -ENOMEM; u8 *p; /* the simple asn1 coding used has length limits */ @@ -1084,7 +1100,7 @@ static int _ep11_unwrapkey(u16 card, u16 domain, * maybe followed by iv data * 
followed by kek tag + kek blob * followed by empty mac tag - * followed by empty pin tag + * followed by empty pin tag or empty pinblob tag * followed by encryted key tag + bytes */ } __packed * req_pl; @@ -1099,20 +1115,30 @@ static int _ep11_unwrapkey(u16 card, u16 domain, u8 data[512]; } __packed * rep_pl; struct ep11_cprb *req = NULL, *rep = NULL; + size_t req_pl_size, pinblob_size = 0; struct ep11_target_dev target; struct ep11_urb *urb = NULL; - size_t req_pl_size; int api, rc = -ENOMEM; u8 *p; /* request cprb and payload */ + api = (!keygenflags || keygenflags & 0x00200000) ? + EP11_API_V4 : EP11_API_V1; + if (ap_is_se_guest()) { + /* + * unwrap within SE environment requires API ordinal 6 + * with empty pinblob + */ + api = EP11_API_V6; + pinblob_size = EP11_PINBLOB_V1_BYTES; + } req_pl_size = sizeof(struct uw_req_pl) + (iv ? 16 : 0) - + ASN1TAGLEN(keksize) + 4 + ASN1TAGLEN(enckeysize); + + ASN1TAGLEN(keksize) + ASN1TAGLEN(0) + + ASN1TAGLEN(pinblob_size) + ASN1TAGLEN(enckeysize); req = alloc_cprb(req_pl_size); if (!req) goto out; req_pl = (struct uw_req_pl *)(((u8 *)req) + sizeof(*req)); - api = (!keygenflags || keygenflags & 0x00200000) ? 4 : 1; prep_head(&req_pl->head, req_pl_size, api, 34); /* UnwrapKey */ req_pl->attr_tag = 0x04; req_pl->attr_len = 7 * sizeof(u32); @@ -1137,9 +1163,10 @@ static int _ep11_unwrapkey(u16 card, u16 domain, /* empty mac key tag */ *p++ = 0x04; *p++ = 0; - /* empty pin tag */ + /* pin tag */ *p++ = 0x04; - *p++ = 0; + *p++ = pinblob_size; + p += pinblob_size; /* encrypted key value tag and bytes */ p += asn1tag_write(p, 0x04, enckey, enckeysize); @@ -1275,7 +1302,8 @@ static int _ep11_wrapkey(u16 card, u16 domain, if (!mech || mech == 0x80060001) req->flags |= 0x20; /* CPACF_WRAP needs special bit */ req_pl = (struct wk_req_pl *)(((u8 *)req) + sizeof(*req)); - api = (!mech || mech == 0x80060001) ? 4 : 1; /* CKM_IBM_CPACF_WRAP */ + api = (!mech || mech == 0x80060001) ? 
/* CKM_IBM_CPACF_WRAP */ + EP11_API_V4 : EP11_API_V1; prep_head(&req_pl->head, req_pl_size, api, 33); /* WrapKey */ req_pl->var_tag = 0x04; req_pl->var_len = sizeof(u32); diff --git a/drivers/s390/crypto/zcrypt_ep11misc.h b/drivers/s390/crypto/zcrypt_ep11misc.h index a0de1cccebbe..9d17fd5228a7 100644 --- a/drivers/s390/crypto/zcrypt_ep11misc.h +++ b/drivers/s390/crypto/zcrypt_ep11misc.h @@ -12,7 +12,9 @@ #include #include -#define EP11_API_V 4 /* highest known and supported EP11 API version */ +#define EP11_API_V1 1 /* min EP11 API, default if no higher api required */ +#define EP11_API_V4 4 /* supported EP11 API for the ep11misc cprbs */ +#define EP11_API_V6 6 /* min EP11 API for some cprbs in SE environment */ #define EP11_STRUCT_MAGIC 0x1234 #define EP11_BLOB_PKEY_EXTRACTABLE 0x00200000 -- cgit v1.2.3 From 979fe44af819d76fc02a5a97ad1cb74f7d83578e Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Wed, 16 Aug 2023 15:29:42 +0200 Subject: s390/ipl: fix virtual vs physical address confusion The value of ipl_cert_list_addr boot variable contains a physical address, which is used directly. That works because virtual and physical address spaces are currently the same, but otherwise it is wrong. While at it, fix also a comment for the platform keyring. 
Signed-off-by: Alexander Gordeev Reviewed-by: Mimi Zohar Acked-by: Jarkko Sakkinen Link: https://lore.kernel.org/r/20230816132942.2540411-1-agordeev@linux.ibm.com Signed-off-by: Heiko Carstens --- arch/s390/kernel/machine_kexec_file.c | 4 ++-- arch/s390/kernel/setup.c | 2 +- security/integrity/platform_certs/load_ipl_s390.c | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/s390/kernel/machine_kexec_file.c b/arch/s390/kernel/machine_kexec_file.c index 2df94d32140c..8d207b82d9fe 100644 --- a/arch/s390/kernel/machine_kexec_file.c +++ b/arch/s390/kernel/machine_kexec_file.c @@ -188,7 +188,7 @@ static int kexec_file_add_ipl_report(struct kimage *image, data->memsz = ALIGN(data->memsz, PAGE_SIZE); buf.mem = data->memsz; - ptr = (void *)ipl_cert_list_addr; + ptr = __va(ipl_cert_list_addr); end = ptr + ipl_cert_list_size; ncerts = 0; while (ptr < end) { @@ -200,7 +200,7 @@ static int kexec_file_add_ipl_report(struct kimage *image, addr = data->memsz + data->report->size; addr += ncerts * sizeof(struct ipl_rb_certificate_entry); - ptr = (void *)ipl_cert_list_addr; + ptr = __va(ipl_cert_list_addr); while (ptr < end) { len = *(unsigned int *)ptr; ptr += sizeof(len); diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 393dd8385506..c744104e4a9c 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -875,7 +875,7 @@ static void __init log_component_list(void) pr_info("Linux is running with Secure-IPL enabled\n"); else pr_info("Linux is running with Secure-IPL disabled\n"); - ptr = (void *) early_ipl_comp_list_addr; + ptr = __va(early_ipl_comp_list_addr); end = (void *) ptr + early_ipl_comp_list_size; pr_info("The IPL report contains the following components:\n"); while (ptr < end) { diff --git a/security/integrity/platform_certs/load_ipl_s390.c b/security/integrity/platform_certs/load_ipl_s390.c index e769dcb7ea94..c7c381a9ddaa 100644 --- a/security/integrity/platform_certs/load_ipl_s390.c +++ 
b/security/integrity/platform_certs/load_ipl_s390.c @@ -22,8 +22,8 @@ static int __init load_ipl_certs(void) if (!ipl_cert_list_addr) return 0; - /* Copy the certificates to the system keyring */ - ptr = (void *) ipl_cert_list_addr; + /* Copy the certificates to the platform keyring */ + ptr = __va(ipl_cert_list_addr); end = ptr + ipl_cert_list_size; while ((void *) ptr < end) { len = *(unsigned int *) ptr; -- cgit v1.2.3 From cfd012107f11ec4af010f11eca341edc831abf6c Mon Sep 17 00:00:00 2001 From: Justin Stitt Date: Fri, 11 Aug 2023 21:56:15 +0000 Subject: s390/ipl: refactor deprecated strncpy `strncpy` is deprecated for use on NUL-terminated destination strings [1]. Use `strscpy` which has the same behavior as `strncpy` here with the extra safeguard of guaranteeing NUL-termination of destination strings. In its current form, this may result in silent truncation if the src string has the same size as the destination string. [hca@linux.ibm.com: use strscpy() instead of strscpy_pad()] Link: www.kernel.org/doc/html/latest/process/deprecated.html#strncpy-on-nul-terminated-strings[1] Link: https://github.com/KSPP/linux/issues/90 Cc: linux-hardening@vger.kernel.org Signed-off-by: Justin Stitt Link: https://lore.kernel.org/r/20230811-arch-s390-kernel-v1-1-7edbeeab3809@google.com Signed-off-by: Heiko Carstens --- arch/s390/kernel/ipl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c index 7f3a84e414a4..05e51666db03 100644 --- a/arch/s390/kernel/ipl.c +++ b/arch/s390/kernel/ipl.c @@ -266,7 +266,7 @@ static ssize_t sys_##_prefix##_##_name##_store(struct kobject *kobj, \ struct kobj_attribute *attr, \ const char *buf, size_t len) \ { \ - strncpy(_value, buf, sizeof(_value) - 1); \ + strscpy(_value, buf, sizeof(_value)); \ strim(_value); \ return len; \ } \ -- cgit v1.2.3 From 680b7ddd7e2ab7638d431722432f6d02d75dade1 Mon Sep 17 00:00:00 2001 From: Tony Krowiak Date: Tue, 15 Aug 2023 14:43:22 -0400 Subject:
s390/vfio-ap: no need to check the 'E' and 'I' bits in APQSW after TAPQ After a ZAPQ is executed to reset a queue, if the queue is not empty or interrupts are still enabled, the vfio_ap driver will wait for the reset operation to complete by repeatedly executing the TAPQ instruction and checking the 'E' and 'I' bits in the APQSW to verify that the queue is empty and interrupts are disabled. This is unnecessary because it is sufficient to check only the response code in the APQSW. If the reset is still in progress, the response code will be 02; however, if the reset has completed successfully, the response code will be 00. Signed-off-by: Tony Krowiak Acked-by: Janosch Frank Tested-by: Viktor Mihajlovski Link: https://lore.kernel.org/r/20230815184333.6554-2-akrowiak@linux.ibm.com Signed-off-by: Heiko Carstens --- drivers/s390/crypto/vfio_ap_ops.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c index b441745b0418..3fd80533194b 100644 --- a/drivers/s390/crypto/vfio_ap_ops.c +++ b/drivers/s390/crypto/vfio_ap_ops.c @@ -1608,19 +1608,10 @@ static int apq_status_check(int apqn, struct ap_queue_status *status) { switch (status->response_code) { case AP_RESPONSE_NORMAL: - case AP_RESPONSE_RESET_IN_PROGRESS: - if (status->queue_empty && !status->irq_enabled) - return 0; - return -EBUSY; case AP_RESPONSE_DECONFIGURED: - /* - * If the AP queue is deconfigured, any subsequent AP command - * targeting the queue will fail with the same response code. On the - * other hand, when an AP adapter is deconfigured, the associated - * queues are reset, so let's return a value indicating the reset - * for which we're waiting completed successfully. 
- */ return 0; + case AP_RESPONSE_RESET_IN_PROGRESS: + return -EBUSY; default: WARN(true, "failed to verify reset of queue %02x.%04x: TAPQ rc=%u\n", -- cgit v1.2.3 From 7aa7b2a80cb70d528785f06a54d6c8148826006d Mon Sep 17 00:00:00 2001 From: Tony Krowiak Date: Tue, 15 Aug 2023 14:43:23 -0400 Subject: s390/vfio-ap: clean up irq resources if possible The architecture does not specify whether interrupts are disabled as part of the asynchronous reset or upon return from the PQAP/ZAPQ instruction. If, however, PQAP/ZAPQ completes with APQSW response code 0 and the interrupt bit in the status word is also 0, we know the interrupts are disabled and we can go ahead and clean up the corresponding resources; otherwise, we must wait until the asynchronous reset has completed. Signed-off-by: Tony Krowiak Suggested-by: Halil Pasic Reviewed-by: Jason J. Herne Acked-by: Halil Pasic Acked-by: Janosch Frank Tested-by: Viktor Mihajlovski Link: https://lore.kernel.org/r/20230815184333.6554-3-akrowiak@linux.ibm.com Signed-off-by: Heiko Carstens --- drivers/s390/crypto/vfio_ap_ops.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c index 3fd80533194b..be92ba45226d 100644 --- a/drivers/s390/crypto/vfio_ap_ops.c +++ b/drivers/s390/crypto/vfio_ap_ops.c @@ -1654,9 +1654,13 @@ retry_zapq: switch (status.response_code) { case AP_RESPONSE_NORMAL: ret = 0; - /* if the reset has not completed, wait for it to take effect */ - if (!status.queue_empty || status.irq_enabled) + if (!status.irq_enabled) + vfio_ap_free_aqic_resources(q); + if (!status.queue_empty || status.irq_enabled) { ret = apq_reset_check(q); + if (status.irq_enabled && ret == 0) + vfio_ap_free_aqic_resources(q); + } break; case AP_RESPONSE_RESET_IN_PROGRESS: /* @@ -1675,6 +1679,7 @@ retry_zapq: * completed successfully. 
*/ ret = 0; + vfio_ap_free_aqic_resources(q); break; default: WARN(true, @@ -1684,8 +1689,6 @@ retry_zapq: return -EIO; } - vfio_ap_free_aqic_resources(q); - return ret; } -- cgit v1.2.3 From 411b0109daa52d1cc5be39635631e22a5590c5d8 Mon Sep 17 00:00:00 2001 From: Tony Krowiak Date: Tue, 15 Aug 2023 14:43:24 -0400 Subject: s390/vfio-ap: wait for response code 05 to clear on queue reset Response code 05, AP busy, is a valid response code for a ZAPQ or TAPQ. Instead of returning error -EIO when a ZAPQ fails with response code 05, let's wait until the queue is no longer busy and try the ZAPQ again. Signed-off-by: Tony Krowiak Acked-by: Janosch Frank Tested-by: Viktor Mihajlovski Link: https://lore.kernel.org/r/20230815184333.6554-4-akrowiak@linux.ibm.com Signed-off-by: Heiko Carstens --- drivers/s390/crypto/vfio_ap_ops.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c index be92ba45226d..3f67cfb53d0c 100644 --- a/drivers/s390/crypto/vfio_ap_ops.c +++ b/drivers/s390/crypto/vfio_ap_ops.c @@ -1611,6 +1611,7 @@ static int apq_status_check(int apqn, struct ap_queue_status *status) case AP_RESPONSE_DECONFIGURED: return 0; case AP_RESPONSE_RESET_IN_PROGRESS: + case AP_RESPONSE_BUSY: return -EBUSY; default: WARN(true, @@ -1663,6 +1664,7 @@ retry_zapq: } break; case AP_RESPONSE_RESET_IN_PROGRESS: + case AP_RESPONSE_BUSY: /* * There is a reset issued by another process in progress. Let's wait * for that to complete. Since we have no idea whether it was a RAPQ or -- cgit v1.2.3 From c51f8c6bb5c8a4878310d55e3a0b91747954b43d Mon Sep 17 00:00:00 2001 From: Tony Krowiak Date: Tue, 15 Aug 2023 14:43:25 -0400 Subject: s390/vfio-ap: allow deconfigured queue to be passed through to a guest When a queue is reset, the status response code returned from the reset operation is stored in the reset_rc field of the vfio_ap_queue structure representing the queue being reset. 
This field is later used to decide whether the queue should be passed through to a guest. If the reset_rc field is a non-zero value, the queue will be filtered from the list of queues passed through. When an adapter is deconfigured, all queues associated with that adapter are reset. That being the case, it is not necessary to filter those queues; so, if the status response code returned from a reset operation indicates the queue is deconfigured, the reset_rc field of the vfio_ap_queue structure will be set to zero so it will be passed through (i.e., not filtered). Signed-off-by: Tony Krowiak Reviewed-by: Jason J. Herne Acked-by: Halil Pasic Tested-by: Viktor Mihajlovski Link: https://lore.kernel.org/r/20230815184333.6554-5-akrowiak@linux.ibm.com Signed-off-by: Heiko Carstens --- drivers/s390/crypto/vfio_ap_ops.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c index 3f67cfb53d0c..a489536c508a 100644 --- a/drivers/s390/crypto/vfio_ap_ops.c +++ b/drivers/s390/crypto/vfio_ap_ops.c @@ -1677,9 +1677,11 @@ retry_zapq: case AP_RESPONSE_DECONFIGURED: /* * When an AP adapter is deconfigured, the associated - * queues are reset, so let's return a value indicating the reset - * completed successfully. + * queues are reset, so let's set the status response code to 0 + * so the queue may be passed through (i.e., not filtered) and + * return a value indicating the reset completed successfully. */ + q->reset_rc = 0; ret = 0; vfio_ap_free_aqic_resources(q); break; -- cgit v1.2.3 From dd174833e44e7717f88f0925b1f78e9ba1d2626e Mon Sep 17 00:00:00 2001 From: Tony Krowiak Date: Tue, 15 Aug 2023 14:43:26 -0400 Subject: s390/vfio-ap: remove upper limit on wait for queue reset to complete The architecture does not define an upper limit on how long a queue reset (RAPQ/ZAPQ) can take to complete. 
In order to ensure both the security requirements and prevent resource leakage and corruption in the hypervisor, it is necessary to remove the upper limit (200ms) the vfio_ap driver currently waits for a reset to complete. This, of course, may result in a hang which is a less than desirable user experience, but until a firmware solution is provided, this is a necessary evil. Signed-off-by: Tony Krowiak Reviewed-by: Jason J. Herne Acked-by: Halil Pasic Tested-by: Viktor Mihajlovski Link: https://lore.kernel.org/r/20230815184333.6554-6-akrowiak@linux.ibm.com Signed-off-by: Heiko Carstens --- drivers/s390/crypto/vfio_ap_ops.c | 64 +++++++++++++++++++++------------------ 1 file changed, 35 insertions(+), 29 deletions(-) diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c index a489536c508a..2517868aad56 100644 --- a/drivers/s390/crypto/vfio_ap_ops.c +++ b/drivers/s390/crypto/vfio_ap_ops.c @@ -30,7 +30,6 @@ #define AP_QUEUE_UNASSIGNED "unassigned" #define AP_QUEUE_IN_USE "in use" -#define MAX_RESET_CHECK_WAIT 200 /* Sleep max 200ms for reset check */ #define AP_RESET_INTERVAL 20 /* Reset sleep interval (20ms) */ static int vfio_ap_mdev_reset_queues(struct ap_queue_table *qtable); @@ -1622,58 +1621,66 @@ static int apq_status_check(int apqn, struct ap_queue_status *status) } } +#define WAIT_MSG "Waited %dms for reset of queue %02x.%04x (%u, %u, %u)" + static int apq_reset_check(struct vfio_ap_queue *q) { - int ret; - int iters = MAX_RESET_CHECK_WAIT / AP_RESET_INTERVAL; + int ret = -EBUSY, elapsed = 0; struct ap_queue_status status; - for (; iters > 0; iters--) { + while (true) { msleep(AP_RESET_INTERVAL); + elapsed += AP_RESET_INTERVAL; status = ap_tapq(q->apqn, NULL); ret = apq_status_check(q->apqn, &status); - if (ret != -EBUSY) + if (ret == -EIO) return ret; + if (ret == -EBUSY) { + pr_notice_ratelimited(WAIT_MSG, elapsed, + AP_QID_CARD(q->apqn), + AP_QID_QUEUE(q->apqn), + status.response_code, + status.queue_empty, + 
status.irq_enabled); + } else { + if (q->reset_rc == AP_RESPONSE_RESET_IN_PROGRESS || + q->reset_rc == AP_RESPONSE_BUSY) { + status = ap_zapq(q->apqn, 0); + q->reset_rc = status.response_code; + continue; + } + /* + * When an AP adapter is deconfigured, the associated + * queues are reset, so let's set the status response + * code to 0 so the queue may be passed through (i.e., + * not filtered). + */ + if (q->reset_rc == AP_RESPONSE_DECONFIGURED) + q->reset_rc = 0; + if (q->saved_isc != VFIO_AP_ISC_INVALID) + vfio_ap_free_aqic_resources(q); + break; + } } - WARN_ONCE(iters <= 0, - "timeout verifying reset of queue %02x.%04x (%u, %u, %u)", - AP_QID_CARD(q->apqn), AP_QID_QUEUE(q->apqn), - status.queue_empty, status.irq_enabled, status.response_code); return ret; } static int vfio_ap_mdev_reset_queue(struct vfio_ap_queue *q) { struct ap_queue_status status; - int ret; + int ret = 0; if (!q) return 0; -retry_zapq: status = ap_zapq(q->apqn, 0); q->reset_rc = status.response_code; switch (status.response_code) { case AP_RESPONSE_NORMAL: - ret = 0; - if (!status.irq_enabled) - vfio_ap_free_aqic_resources(q); - if (!status.queue_empty || status.irq_enabled) { - ret = apq_reset_check(q); - if (status.irq_enabled && ret == 0) - vfio_ap_free_aqic_resources(q); - } - break; case AP_RESPONSE_RESET_IN_PROGRESS: case AP_RESPONSE_BUSY: - /* - * There is a reset issued by another process in progress. Let's wait - * for that to complete. Since we have no idea whether it was a RAPQ or - * ZAPQ, then if it completes successfully, let's issue the ZAPQ. - */ + /* Let's verify whether the ZAPQ completed successfully */ ret = apq_reset_check(q); - if (ret) - break; - goto retry_zapq; + break; case AP_RESPONSE_DECONFIGURED: /* * When an AP adapter is deconfigured, the associated @@ -1682,7 +1689,6 @@ retry_zapq: * return a value indicating the reset completed successfully. 
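*/ q->reset_rc = 0; - ret = 0; vfio_ap_free_aqic_resources(q); break; default: -- cgit v1.2.3 From 62aab082e9993163731656ce270cd3c1d29079af Mon Sep 17 00:00:00 2001 From: Tony Krowiak Date: Tue, 15 Aug 2023 14:43:27 -0400 Subject: s390/vfio-ap: store entire AP queue status word with the queue object Store the entire AP queue status word returned from the ZAPQ command with the struct vfio_ap_queue object instead of just the response code field. The other information contained in the status word is needed by the apq_reset_check function to display a proper message to indicate that the vfio_ap driver is waiting for the ZAPQ to complete because the queue is not empty or IRQs are still enabled. Signed-off-by: Tony Krowiak Tested-by: Viktor Mihajlovski Link: https://lore.kernel.org/r/20230815184333.6554-7-akrowiak@linux.ibm.com Signed-off-by: Heiko Carstens --- drivers/s390/crypto/vfio_ap_ops.c | 27 +++++++++++++++------------ drivers/s390/crypto/vfio_ap_private.h | 4 ++-- 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c index 2517868aad56..43224f7a40ea 100644 --- a/drivers/s390/crypto/vfio_ap_ops.c +++ b/drivers/s390/crypto/vfio_ap_ops.c @@ -674,7 +674,7 @@ static bool vfio_ap_mdev_filter_matrix(unsigned long *apm, unsigned long *aqm, */ apqn = AP_MKQID(apid, apqi); q = vfio_ap_mdev_get_queue(matrix_mdev, apqn); - if (!q || q->reset_rc) { + if (!q || q->reset_status.response_code) { clear_bit_inv(apid, matrix_mdev->shadow_apcb.apm); break; @@ -1628,6 +1628,7 @@ static int apq_reset_check(struct vfio_ap_queue *q) int ret = -EBUSY, elapsed = 0; struct ap_queue_status status; + memcpy(&status, &q->reset_status, sizeof(status)); while (true) { msleep(AP_RESET_INTERVAL); elapsed += AP_RESET_INTERVAL; @@ -1643,20 +1644,20 @@ static int apq_reset_check(struct vfio_ap_queue *q) status.queue_empty, status.irq_enabled); } else { - if (q->reset_rc == AP_RESPONSE_RESET_IN_PROGRESS || - q->reset_rc == AP_RESPONSE_BUSY) { + if (q->reset_status.response_code == AP_RESPONSE_RESET_IN_PROGRESS || + q->reset_status.response_code == AP_RESPONSE_BUSY) { status = ap_zapq(q->apqn, 0); - q->reset_rc = status.response_code; + memcpy(&q->reset_status, &status, sizeof(status)); continue; } /* - * When an AP adapter is deconfigured, the associated - * queues are reset, so let's set the status response - * code to 0 so the queue may be passed through (i.e., - * not filtered). + * When an AP adapter is deconfigured, the + * associated queues are reset, so let's set the + * status response code to 0 so the queue may be + * passed through (i.e., not filtered) */ - if (q->reset_rc == AP_RESPONSE_DECONFIGURED) - q->reset_rc = 0; + if (status.response_code == AP_RESPONSE_DECONFIGURED) + q->reset_status.response_code = 0; if (q->saved_isc != VFIO_AP_ISC_INVALID) vfio_ap_free_aqic_resources(q); break; @@ -1673,7 +1674,7 @@ static int vfio_ap_mdev_reset_queue(struct vfio_ap_queue *q) if (!q) return 0; status = ap_zapq(q->apqn, 0); - q->reset_rc = status.response_code; + memcpy(&q->reset_status, &status, sizeof(status)); switch (status.response_code) { case AP_RESPONSE_NORMAL: case AP_RESPONSE_RESET_IN_PROGRESS: @@ -1688,7 +1689,8 @@ static int vfio_ap_mdev_reset_queue(struct vfio_ap_queue *q) * so the queue may be passed through (i.e., not filtered) and * return a value indicating the reset completed successfully.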
AP_RESPONSE_BUSY) { + if (q->reset_status.response_code == AP_RESPONSE_RESET_IN_PROGRESS || + q->reset_status.response_code == AP_RESPONSE_BUSY) { status = ap_zapq(q->apqn, 0); - q->reset_rc = status.response_code; + memcpy(&q->reset_status, &status, sizeof(status)); continue; } /* - * When an AP adapter is deconfigured, the associated - * queues are reset, so let's set the status response - * code to 0 so the queue may be passed through (i.e., - * not filtered). + * When an AP adapter is deconfigured, the + * associated queues are reset, so let's set the + * status response code to 0 so the queue may be + * passed through (i.e., not filtered) */ - if (q->reset_rc == AP_RESPONSE_DECONFIGURED) - q->reset_rc = 0; + if (status.response_code == AP_RESPONSE_DECONFIGURED) + q->reset_status.response_code = 0; if (q->saved_isc != VFIO_AP_ISC_INVALID) vfio_ap_free_aqic_resources(q); break; @@ -1673,7 +1674,7 @@ static int vfio_ap_mdev_reset_queue(struct vfio_ap_queue *q) if (!q) return 0; status = ap_zapq(q->apqn, 0); - q->reset_rc = status.response_code; + memcpy(&q->reset_status, &status, sizeof(status)); switch (status.response_code) { case AP_RESPONSE_NORMAL: case AP_RESPONSE_RESET_IN_PROGRESS: @@ -1688,7 +1689,8 @@ static int vfio_ap_mdev_reset_queue(struct vfio_ap_queue *q) * so the queue may be passed through (i.e., not filtered) and * return a value indicating the reset completed successfully. 
*/ - q->reset_rc = 0; + q->reset_status.response_code = 0; + ret = 0; vfio_ap_free_aqic_resources(q); break; default: @@ -2042,6 +2044,7 @@ int vfio_ap_mdev_probe_queue(struct ap_device *apdev) q->apqn = to_ap_queue(&apdev->device)->qid; q->saved_isc = VFIO_AP_ISC_INVALID; + memset(&q->reset_status, 0, sizeof(q->reset_status)); matrix_mdev = get_update_locks_by_apqn(q->apqn); if (matrix_mdev) { diff --git a/drivers/s390/crypto/vfio_ap_private.h b/drivers/s390/crypto/vfio_ap_private.h index 4642bbdbd1b2..d6eb3527e056 100644 --- a/drivers/s390/crypto/vfio_ap_private.h +++ b/drivers/s390/crypto/vfio_ap_private.h @@ -133,7 +133,7 @@ struct ap_matrix_mdev { * @apqn: the APQN of the AP queue device * @saved_isc: the guest ISC registered with the GIB interface * @mdev_qnode: allows the vfio_ap_queue struct to be added to a hashtable - * @reset_rc: the status response code from the last reset of the queue + * @reset_status: the status from the last reset of the queue */ struct vfio_ap_queue { struct ap_matrix_mdev *matrix_mdev; @@ -142,7 +142,7 @@ struct vfio_ap_queue { #define VFIO_AP_ISC_INVALID 0xff unsigned char saved_isc; struct hlist_node mdev_qnode; - unsigned int reset_rc; + struct ap_queue_status reset_status; }; int vfio_ap_mdev_register(void); -- cgit v1.2.3 From 9261f0438835a97254590046e1be83733cca440f Mon Sep 17 00:00:00 2001 From: Tony Krowiak Date: Tue, 15 Aug 2023 14:43:28 -0400 Subject: s390/vfio-ap: use work struct to verify queue reset Instead of waiting to verify that a queue is reset in the vfio_ap_mdev_reset_queue function, let's use a work queue to check the state of the reset. This way, when resetting all of the queues assigned to a matrix mdev, we don't have to wait for each queue to be reset before initiating a reset on the next queue to be reset. Signed-off-by: Tony Krowiak Reviewed-by: Jason J.
Herne Suggested-by: Halil Pasic Acked-by: Janosch Frank Tested-by: Viktor Mihajlovski Link: https://lore.kernel.org/r/20230815184333.6554-8-akrowiak@linux.ibm.com Signed-off-by: Heiko Carstens --- drivers/s390/crypto/vfio_ap_ops.c | 48 +++++++++++++++++------------------ drivers/s390/crypto/vfio_ap_private.h | 2 ++ 2 files changed, 25 insertions(+), 25 deletions(-) diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c index 43224f7a40ea..3a59f1c5390f 100644 --- a/drivers/s390/crypto/vfio_ap_ops.c +++ b/drivers/s390/crypto/vfio_ap_ops.c @@ -35,7 +35,7 @@ static int vfio_ap_mdev_reset_queues(struct ap_queue_table *qtable); static struct vfio_ap_queue *vfio_ap_find_queue(int apqn); static const struct vfio_device_ops vfio_ap_matrix_dev_ops; -static int vfio_ap_mdev_reset_queue(struct vfio_ap_queue *q); +static void vfio_ap_mdev_reset_queue(struct vfio_ap_queue *q); /** * get_update_locks_for_kvm: Acquire the locks required to dynamically update a @@ -1623,11 +1623,13 @@ static int apq_status_check(int apqn, struct ap_queue_status *status) #define WAIT_MSG "Waited %dms for reset of queue %02x.%04x (%u, %u, %u)" -static int apq_reset_check(struct vfio_ap_queue *q) +static void apq_reset_check(struct work_struct *reset_work) { int ret = -EBUSY, elapsed = 0; struct ap_queue_status status; + struct vfio_ap_queue *q; + q = container_of(reset_work, struct vfio_ap_queue, reset_work); memcpy(&status, &q->reset_status, sizeof(status)); while (true) { msleep(AP_RESET_INTERVAL); @@ -1635,7 +1637,7 @@ static int apq_reset_check(struct vfio_ap_queue *q) status = ap_tapq(q->apqn, NULL); ret = apq_status_check(q->apqn, &status); if (ret == -EIO) - return ret; + return; if (ret == -EBUSY) { pr_notice_ratelimited(WAIT_MSG, elapsed, AP_QID_CARD(q->apqn), @@ -1663,34 +1665,32 @@ static int apq_reset_check(struct vfio_ap_queue *q) break; } } - return ret; } -static int vfio_ap_mdev_reset_queue(struct vfio_ap_queue *q) +static void 
vfio_ap_mdev_reset_queue(struct vfio_ap_queue *q) { struct ap_queue_status status; - int ret = 0; if (!q) - return 0; + return; status = ap_zapq(q->apqn, 0); memcpy(&q->reset_status, &status, sizeof(status)); switch (status.response_code) { case AP_RESPONSE_NORMAL: case AP_RESPONSE_RESET_IN_PROGRESS: case AP_RESPONSE_BUSY: - /* Let's verify whether the ZAPQ completed successfully */ - ret = apq_reset_check(q); + /* + * Let's verify whether the ZAPQ completed successfully on a work queue. + */ + queue_work(system_long_wq, &q->reset_work); break; case AP_RESPONSE_DECONFIGURED: /* * When an AP adapter is deconfigured, the associated * queues are reset, so let's set the status response code to 0 - * so the queue may be passed through (i.e., not filtered) and - * return a value indicating the reset completed successfully. + * so the queue may be passed through (i.e., not filtered). */ q->reset_status.response_code = 0; - ret = 0; vfio_ap_free_aqic_resources(q); break; default: @@ -1698,29 +1698,25 @@ static int vfio_ap_mdev_reset_queue(struct vfio_ap_queue *q) "PQAP/ZAPQ for %02x.%04x failed with invalid rc=%u\n", AP_QID_CARD(q->apqn), AP_QID_QUEUE(q->apqn), status.response_code); - return -EIO; } - - return ret; } static int vfio_ap_mdev_reset_queues(struct ap_queue_table *qtable) { - int ret, loop_cursor, rc = 0; + int ret = 0, loop_cursor; struct vfio_ap_queue *q; + hash_for_each(qtable->queues, loop_cursor, q, mdev_qnode) + vfio_ap_mdev_reset_queue(q); + hash_for_each(qtable->queues, loop_cursor, q, mdev_qnode) { - ret = vfio_ap_mdev_reset_queue(q); - /* - * Regardless whether a queue turns out to be busy, or - * is not operational, we need to continue resetting - * the remaining queues. 
- */ - if (ret) - rc = ret; + flush_work(&q->reset_work); + + if (q->reset_status.response_code) + ret = -EIO; } - return rc; + return ret; } static int vfio_ap_mdev_open_device(struct vfio_device *vdev) @@ -2045,6 +2041,7 @@ int vfio_ap_mdev_probe_queue(struct ap_device *apdev) q->apqn = to_ap_queue(&apdev->device)->qid; q->saved_isc = VFIO_AP_ISC_INVALID; memset(&q->reset_status, 0, sizeof(q->reset_status)); + INIT_WORK(&q->reset_work, apq_reset_check); matrix_mdev = get_update_locks_by_apqn(q->apqn); if (matrix_mdev) { @@ -2094,6 +2091,7 @@ void vfio_ap_mdev_remove_queue(struct ap_device *apdev) } vfio_ap_mdev_reset_queue(q); + flush_work(&q->reset_work); dev_set_drvdata(&apdev->device, NULL); kfree(q); release_update_locks_for_mdev(matrix_mdev); diff --git a/drivers/s390/crypto/vfio_ap_private.h b/drivers/s390/crypto/vfio_ap_private.h index d6eb3527e056..88aff8b81f2f 100644 --- a/drivers/s390/crypto/vfio_ap_private.h +++ b/drivers/s390/crypto/vfio_ap_private.h @@ -134,6 +134,7 @@ struct ap_matrix_mdev { * @saved_isc: the guest ISC registered with the GIB interface * @mdev_qnode: allows the vfio_ap_queue struct to be added to a hashtable * @reset_status: the status from the last reset of the queue + * @reset_work: work to wait for queue reset to complete */ struct vfio_ap_queue { struct ap_matrix_mdev *matrix_mdev; @@ -143,6 +144,7 @@ struct vfio_ap_queue { unsigned char saved_isc; struct hlist_node mdev_qnode; struct ap_queue_status reset_status; + struct work_struct reset_work; }; int vfio_ap_mdev_register(void); -- cgit v1.2.3 From e1f17f8ea93d8fc9d6d0562d38bb0a5fb3e8355e Mon Sep 17 00:00:00 2001 From: Tony Krowiak Date: Tue, 15 Aug 2023 14:43:29 -0400 Subject: s390/vfio-ap: handle queue state change in progress on reset A new APQSW response code (0xA) indicating the designated queue is in the process of being bound or associated to a configuration may be returned from the PQAP(ZAPQ) command. 
This patch introduces code that will verify when the PQAP(ZAPQ) command can be re-issued after receiving response code 0xA. Signed-off-by: Tony Krowiak Reviewed-by: Jason J. Herne Acked-by: Halil Pasic Tested-by: Viktor Mihajlovski Link: https://lore.kernel.org/r/20230815184333.6554-9-akrowiak@linux.ibm.com Signed-off-by: Heiko Carstens --- drivers/s390/crypto/vfio_ap_ops.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c index 3a59f1c5390f..43dea259fe23 100644 --- a/drivers/s390/crypto/vfio_ap_ops.c +++ b/drivers/s390/crypto/vfio_ap_ops.c @@ -1647,7 +1647,8 @@ static void apq_reset_check(struct work_struct *reset_work) status.irq_enabled); } else { if (q->reset_status.response_code == AP_RESPONSE_RESET_IN_PROGRESS || - q->reset_status.response_code == AP_RESPONSE_BUSY) { + q->reset_status.response_code == AP_RESPONSE_BUSY || + q->reset_status.response_code == AP_RESPONSE_STATE_CHANGE_IN_PROGRESS) { status = ap_zapq(q->apqn, 0); memcpy(&q->reset_status, &status, sizeof(status)); continue; @@ -1679,6 +1680,7 @@ static void vfio_ap_mdev_reset_queue(struct vfio_ap_queue *q) case AP_RESPONSE_NORMAL: case AP_RESPONSE_RESET_IN_PROGRESS: case AP_RESPONSE_BUSY: + case AP_RESPONSE_STATE_CHANGE_IN_PROGRESS: /* * Let's verify whether the ZAPQ completed successfully on a work queue. */ -- cgit v1.2.3 From 7847a19b5b6265f11e71c8499a3b608edac7f398 Mon Sep 17 00:00:00 2001 From: Tony Krowiak Date: Tue, 15 Aug 2023 14:43:30 -0400 Subject: s390/vfio-ap: check for TAPQ response codes 0x35 and 0x36 Check for response codes 0x35 and 0x36 which are asynchronous return codes indicating a failure of the guest to associate a secret with a queue. 
Since there can be no interaction with this queue from the guest (i.e., the vcpus are out of SIE for hot unplug, the guest is being shut down or an emulated subsystem reset of the guest is taking place), let's go ahead and re-issue the ZAPQ to reset and zeroize the queue. Signed-off-by: Tony Krowiak Reviewed-by: Jason J. Herne Reviewed-by: Halil Pasic Tested-by: Viktor Mihajlovski Link: https://lore.kernel.org/r/20230815184333.6554-10-akrowiak@linux.ibm.com Signed-off-by: Heiko Carstens --- drivers/s390/crypto/vfio_ap_ops.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c index 43dea259fe23..8bda52c46df0 100644 --- a/drivers/s390/crypto/vfio_ap_ops.c +++ b/drivers/s390/crypto/vfio_ap_ops.c @@ -1612,6 +1612,16 @@ static int apq_status_check(int apqn, struct ap_queue_status *status) case AP_RESPONSE_RESET_IN_PROGRESS: case AP_RESPONSE_BUSY: return -EBUSY; + case AP_RESPONSE_ASSOC_SECRET_NOT_UNIQUE: + case AP_RESPONSE_ASSOC_FAILED: + /* + * These asynchronous response codes indicate a PQAP(AAPQ) + * instruction to associate a secret with the guest failed. All + * subsequent AP instructions will end with the asynchronous + * response code until the AP queue is reset; so, let's return + * a value indicating a reset needs to be performed again. 
+ */ + return -EAGAIN; default: WARN(true, "failed to verify reset of queue %02x.%04x: TAPQ rc=%u\n", @@ -1648,7 +1658,8 @@ static void apq_reset_check(struct work_struct *reset_work) } else { if (q->reset_status.response_code == AP_RESPONSE_RESET_IN_PROGRESS || q->reset_status.response_code == AP_RESPONSE_BUSY || - q->reset_status.response_code == AP_RESPONSE_STATE_CHANGE_IN_PROGRESS) { + q->reset_status.response_code == AP_RESPONSE_STATE_CHANGE_IN_PROGRESS || + ret == -EAGAIN) { status = ap_zapq(q->apqn, 0); memcpy(&q->reset_status, &status, sizeof(status)); continue; -- cgit v1.2.3 From cf3fa16a6fd49216ae83502e61bea0d8322b51eb Mon Sep 17 00:00:00 2001 From: Janosch Frank Date: Tue, 15 Aug 2023 14:43:31 -0400 Subject: s390/uv: export uv_pin_shared for direct usage Export the uv_pin_shared function so that it can be called from other modules that carry a GPL-compatible license. Signed-off-by: Janosch Frank Signed-off-by: Tony Krowiak Tested-by: Viktor Mihajlovski Link: https://lore.kernel.org/r/20230815184333.6554-11-akrowiak@linux.ibm.com Signed-off-by: Heiko Carstens --- arch/s390/include/asm/uv.h | 6 ++++++ arch/s390/kernel/uv.c | 3 ++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/s390/include/asm/uv.h b/arch/s390/include/asm/uv.h index d6bb2f4f78d1..d2cd42bb2c26 100644 --- a/arch/s390/include/asm/uv.h +++ b/arch/s390/include/asm/uv.h @@ -463,6 +463,7 @@ static inline int is_prot_virt_host(void) return prot_virt_host; } +int uv_pin_shared(unsigned long paddr); int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb); int gmap_destroy_page(struct gmap *gmap, unsigned long gaddr); int uv_destroy_owned_page(unsigned long paddr); @@ -475,6 +476,11 @@ void setup_uv(void); #define is_prot_virt_host() 0 static inline void setup_uv(void) {} +static inline int uv_pin_shared(unsigned long paddr) +{ + return 0; +} + static inline int uv_destroy_owned_page(unsigned long paddr) { return 0; diff --git a/arch/s390/kernel/uv.c 
b/arch/s390/kernel/uv.c index 66f0eb1c872b..b771f1b4cdd1 100644 --- a/arch/s390/kernel/uv.c +++ b/arch/s390/kernel/uv.c @@ -88,7 +88,7 @@ fail: * Requests the Ultravisor to pin the page in the shared state. This will * cause an intercept when the guest attempts to unshare the pinned page. */ -static int uv_pin_shared(unsigned long paddr) +int uv_pin_shared(unsigned long paddr) { struct uv_cb_cfs uvcb = { .header.cmd = UVC_CMD_PIN_PAGE_SHARED, @@ -100,6 +100,7 @@ static int uv_pin_shared(unsigned long paddr) return -EINVAL; return 0; } +EXPORT_SYMBOL_GPL(uv_pin_shared); /* * Requests the Ultravisor to destroy a guest page and make it -- cgit v1.2.3 From fb5040ef7f707525d0681cf6bfe424ccd1aadab7 Mon Sep 17 00:00:00 2001 From: Tony Krowiak Date: Tue, 15 Aug 2023 14:43:32 -0400 Subject: KVM: s390: export kvm_s390_pv*_is_protected functions Export the kvm_s390_pv_is_protected and kvm_s390_pv_cpu_is_protected functions so that they can be called from other modules that carry a GPL-compatible license. 
Signed-off-by: Janosch Frank Signed-off-by: Tony Krowiak Tested-by: Viktor Mihajlovski Link: https://lore.kernel.org/r/20230815184333.6554-12-akrowiak@linux.ibm.com Signed-off-by: Heiko Carstens --- arch/s390/include/asm/kvm_host.h | 3 +++ arch/s390/kvm/kvm-s390.h | 12 ------------ arch/s390/kvm/pv.c | 14 ++++++++++++++ 3 files changed, 17 insertions(+), 12 deletions(-) diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 2bbc3d54959d..91bfecb91321 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -1028,6 +1028,9 @@ static inline int sie64a(struct kvm_s390_sie_block *sie_block, u64 *rsa) extern char sie_exit; +bool kvm_s390_pv_is_protected(struct kvm *kvm); +bool kvm_s390_pv_cpu_is_protected(struct kvm_vcpu *vcpu); + extern int kvm_s390_gisc_register(struct kvm *kvm, u32 gisc); extern int kvm_s390_gisc_unregister(struct kvm *kvm, u32 gisc); diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 0261d42c7d01..a7ea80cfa445 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -270,18 +270,6 @@ static inline u64 kvm_s390_pv_cpu_get_handle(struct kvm_vcpu *vcpu) return vcpu->arch.pv.handle; } -static inline bool kvm_s390_pv_is_protected(struct kvm *kvm) -{ - lockdep_assert_held(&kvm->lock); - return !!kvm_s390_pv_get_handle(kvm); -} - -static inline bool kvm_s390_pv_cpu_is_protected(struct kvm_vcpu *vcpu) -{ - lockdep_assert_held(&vcpu->mutex); - return !!kvm_s390_pv_cpu_get_handle(vcpu); -} - /* implemented in interrupt.c */ int kvm_s390_handle_wait(struct kvm_vcpu *vcpu); void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu); diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c index 2f34c7c3c5ab..856140e9942e 100644 --- a/arch/s390/kvm/pv.c +++ b/arch/s390/kvm/pv.c @@ -18,6 +18,20 @@ #include #include "kvm-s390.h" +bool kvm_s390_pv_is_protected(struct kvm *kvm) +{ + lockdep_assert_held(&kvm->lock); + return !!kvm_s390_pv_get_handle(kvm); +} 
+EXPORT_SYMBOL_GPL(kvm_s390_pv_is_protected); + +bool kvm_s390_pv_cpu_is_protected(struct kvm_vcpu *vcpu) +{ + lockdep_assert_held(&vcpu->mutex); + return !!kvm_s390_pv_cpu_get_handle(vcpu); +} +EXPORT_SYMBOL_GPL(kvm_s390_pv_cpu_is_protected); + /** * struct pv_vm_to_be_destroyed - Represents a protected VM that needs to * be destroyed -- cgit v1.2.3 From f88fb1335733029b4630fb93cfaad349a81e57b2 Mon Sep 17 00:00:00 2001 From: Tony Krowiak Date: Tue, 15 Aug 2023 14:43:33 -0400 Subject: s390/vfio-ap: make sure nib is shared Since the NIB is visible to HW, KVM and the (PV) guest, it needs to be in non-secure or secure but shared storage. Return code 6 is used to indicate to a PV guest that its NIB would be on secure, unshared storage and therefore the NIB address is invalid. Unfortunately we have no easy way to check if a page is unshared after vfio_pin_pages() since it will automatically export an unshared page if the UV pin shared call did not succeed due to a page being in unshared state. Therefore we rely on the fact that UV pinning an already pinned shared page a second time is a no-op, whereas trying to pin an exported page is an error (0x102). If we encounter this error, we do a vfio unpin and import the page again, since vfio_pin_pages() exported it. 
Signed-off-by: Janosch Frank Signed-off-by: Tony Krowiak Acked-by: Halil Pasic Tested-by: Viktor Mihajlovski Link: https://lore.kernel.org/r/20230815184333.6554-13-akrowiak@linux.ibm.com Signed-off-by: Heiko Carstens --- drivers/s390/crypto/vfio_ap_ops.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c index 8bda52c46df0..0509f80622cd 100644 --- a/drivers/s390/crypto/vfio_ap_ops.c +++ b/drivers/s390/crypto/vfio_ap_ops.c @@ -359,6 +359,28 @@ static int vfio_ap_validate_nib(struct kvm_vcpu *vcpu, dma_addr_t *nib) return 0; } +static int ensure_nib_shared(unsigned long addr, struct gmap *gmap) +{ + int ret; + + /* + * The nib has to be located in shared storage since guest and + * host access it. vfio_pin_pages() will do a pin shared and + * if that fails (possibly because it's not a shared page) it + * calls export. We try to do a second pin shared here so that + * the UV gives us an error code if we try to pin a non-shared + * page. + * + * If the page is already pinned shared the UV will return a success. 
+ */ + ret = uv_pin_shared(addr); + if (ret) { + /* vfio_pin_pages() likely exported the page so let's re-import */ + gmap_convert_to_secure(gmap, addr); + } + return ret; +} + /** * vfio_ap_irq_enable - Enable Interruption for a APQN * @@ -422,6 +444,14 @@ static struct ap_queue_status vfio_ap_irq_enable(struct vfio_ap_queue *q, h_nib = page_to_phys(h_page) | (nib & ~PAGE_MASK); aqic_gisa.gisc = isc; + /* NIB in non-shared storage is a rc 6 for PV guests */ + if (kvm_s390_pv_cpu_is_protected(vcpu) && + ensure_nib_shared(h_nib & PAGE_MASK, kvm->arch.gmap)) { + vfio_unpin_pages(&q->matrix_mdev->vdev, nib, 1); + status.response_code = AP_RESPONSE_INVALID_ADDRESS; + return status; + } + nisc = kvm_s390_gisc_register(kvm, isc); if (nisc < 0) { VFIO_AP_DBF_WARN("%s: gisc registration failed: nisc=%d, isc=%d, apqn=%#04x\n", -- cgit v1.2.3 From dedf98dd1cfb61cfc74be9248b90a49b42c6dead Mon Sep 17 00:00:00 2001 From: Li Zetao Date: Tue, 15 Aug 2023 16:08:33 +0800 Subject: s390/pci: use builtin_misc_device macro to simplify the code Use the builtin_misc_device macro to simplify the code, which is the same as declaring with device_initcall(). Signed-off-by: Li Zetao Acked-by: Niklas Schnelle Link: https://lore.kernel.org/r/20230815080833.1103609-1-lizetao1@huawei.com Signed-off-by: Niklas Schnelle Signed-off-by: Heiko Carstens --- arch/s390/pci/pci_clp.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/arch/s390/pci/pci_clp.c b/arch/s390/pci/pci_clp.c index ee367798e388..ee90a91ed888 100644 --- a/arch/s390/pci/pci_clp.c +++ b/arch/s390/pci/pci_clp.c @@ -666,9 +666,4 @@ static struct miscdevice clp_misc_device = { .fops = &clp_misc_fops, }; -static int __init clp_misc_init(void) -{ - return misc_register(&clp_misc_device); -} - -device_initcall(clp_misc_init); +builtin_misc_device(clp_misc_device); -- cgit v1.2.3